From: jacob Date: Sun, 19 Dec 2004 23:20:10 +0000 (+0000) Subject: Split portals into its own CVS module (b1_4) X-Git-Tag: v1_8_0_110~486^5~76 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=b25c4c4344a90c317853a30ca267ac65f1233b53;p=fs%2Flustre-release.git Split portals into its own CVS module (b1_4) --- diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index 33a6ad7..a31e01a 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -29,8 +29,8 @@ linux/ldiskfs%.h: linux-stage/include/linux/ext3%.h # FIXME: we need to grab the series in configure somehow # (see bug 1679) # -series := @top_srcdir@/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) -patches := @top_srcdir@/kernel_patches/patches +series := @top_srcdir@/lustre/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) +patches := @top_srcdir@/lustre/kernel_patches/patches sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) rm -rf linux-stage linux sources $(ldiskfs_SOURCES) diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 59849db..137220d 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -1,5 +1,3 @@ -subdir-m += portals - @LDISKFS_TRUE@subdir-m += ldiskfs subdir-m += lvfs diff --git a/lustre/README b/lustre/README deleted file mode 100644 index c052124..0000000 --- a/lustre/README +++ /dev/null @@ -1,2 +0,0 @@ -Instructions for building, configuring and running Lustre can be found at: - http://projects.clusterfs.com/lustre/LustreHowto. diff --git a/lustre/README.kernel-source b/lustre/README.kernel-source deleted file mode 100644 index 0b88efe1..0000000 --- a/lustre/README.kernel-source +++ /dev/null @@ -1,74 +0,0 @@ -Lustre can currently build against Red Hat 2.4-style kernel-source -RPMs. All other kernel-source RPMs are *unsupported* at this time. 
- -Note that a Lustre-patched kernel is required for building Lustre; in -most cases a kernel-source RPM from your Linux vendor will not contain -the necessary patches. - -1. kernel.h - -Building against a kernel-source RPM requires a special header. On -Red Hat systems, this file should be automatically created at boot -time, and saved in /boot/kernel.h. - - *** If you are not running Red Hat Linux, or are not booted into the - *** kernel you are trying to build against, you need to create this - *** file manually. - - *** If you do not, the Lustre build may fail, or may fail to build - *** modules that work with your kernel. - -Here is an example /boot/kernel.h file. If you are building on -x86_64, the first defines should be __MODULE_KERNEL_x86_64, etc. The -other defines should be simple to figure out. - -/* This file is automatically generated at boot time. */ -#ifndef __BOOT_KERNEL_H_ -#define __BOOT_KERNEL_H_ - -/* Kernel type i686-smp */ - -#ifndef __MODULE_KERNEL_i686 -#define __MODULE_KERNEL_i686 1 -#endif - -#ifndef __BOOT_KERNEL_ENTERPRISE -#define __BOOT_KERNEL_ENTERPRISE 0 -#endif - -#ifndef __BOOT_KERNEL_BIGMEM -#define __BOOT_KERNEL_BIGMEM 0 -#endif - -#ifndef __BOOT_KERNEL_HUGEMEM -#define __BOOT_KERNEL_HUGEMEM 0 -#endif - -#ifndef __BOOT_KERNEL_SMP -#define __BOOT_KERNEL_SMP 1 -#endif - -#ifndef __BOOT_KERNEL_UP -#define __BOOT_KERNEL_UP 0 -#endif - -#endif - -You should save this somewhere, and pass the location of this file to -./configure using the --with-kernel-source-header option. - -2. .config - -You will also need to tell Lustre about the .config file for your -kernel. The two likely locations of this file are -/boot/config-$(uname -r), and /usr/src/linux-2.4/configs/. You should -pass the location of this file to Lustre using the --with-linux-config -option. - -3. 
An Example - -Here is an example for configuring Lustre: - -./configure --with-linux=/usr/src/linux-2.4.20-28.9_lustre.1.0.3 \ ---with-kernel-source-header=/boot/kernel.h \ ---with-linux-config=/boot/config-2.4.20-28.9_lustre.1.0.3smp diff --git a/lustre/Rules.in b/lustre/Rules.in deleted file mode 100644 index 293ff3c..0000000 --- a/lustre/Rules.in +++ /dev/null @@ -1,46 +0,0 @@ -# Directories building kernel modules should have two files: -# -# Makefile.in: -# -# MODULES := -# -objs := file1.o file2.o file3.o -# @INCLUDE_RULES@ -# -# and autoMakefile.am: -# -# if LIBLUSTRE -# -# endif -# -# if MODULES -# modulefs_DATA = $(KMODEXT) -# endif -# -# DIST_SOURCES = $(-objs:.o=.c) -# MOSTLYCLEANFILES = *.o *.ko *.mod.c - -ifeq ($(PATCHLEVEL),) - -include autoMakefile - -else - -include @LINUX_CONFIG@ - -EXTRA_CFLAGS := $(EXTRA_PRE_CFLAGS) -EXTRA_CFLAGS += @EXTRA_KCFLAGS@ @UML_CFLAGS@ -EXTRA_CFLAGS += $(EXTRA_POST_CFLAGS) - -obj-m := $(patsubst %,%.o,$(MODULES)) - -ifeq ($(PATCHLEVEL),4) -# 2.4 rules -O_TARGET := $(firstword $(obj-m)) -obj-y := $($(firstword $(MODULES))-objs) -export-objs := $(obj-y) $(filter-out $(O_TARGET),$(obj-m)) -include $(TOPDIR)/Rules.make -$(MODINCL)/%.ver: %.c - @true -endif # PATCHLEVEL - -endif # KERNELRELEASE diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index f8b503c..f36ca06 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -5,46 +5,11 @@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = . 
include portals ldiskfs lvfs obdclass lov ldlm ptlrpc \ - obdecho osc mdc mds obdfilter ost llite cobd ptlbd snapfs smfs \ - liblustre doc utils tests conf scripts +SUBDIRS = include ldiskfs lvfs obdclass lov ldlm ptlrpc \ + obdecho osc mdc mds obdfilter ost llite cobd ptlbd snapfs smfs \ + liblustre doc utils tests conf scripts autoconf -EXTRA_DIST = BUGS FDL Rules.in kernel_patches kernel-tests/Makefile \ - README.kernel-source - -# these empty rules are needed so that automake doesn't add its own -# recursive rules -etags-recursive: - -ctags-recursive: - -tags-recursive: - -TAGS: - -tags: - rm -f $(top_srcdir)/TAGS - ETAGSF=`etags --version | grep -iq exuberant && \ - echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ - find $(top_srcdir) -name '*.[hc]' | xargs etags $$ETAGSF -a - - rm -f $(top_srcdir)/tags - CTAGSF=`ctags --version | grep -iq exuberant && \ - echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ - find $(top_srcdir) -name '*.[hc]' | xargs ctags $$CTAGSF -a - -if MODULES -all-am: modules - -if !LINUX25 -DEP = dep -dep: .depend - -.depend: $(LDISKFS) lvfs-sources - $(MAKE) $(ARCH_UM) CC="$(CC)" -C $(LINUX_OBJ) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) -o scripts -o include/config/MARKER _sfdep_$(PWD) _FASTDEP_ALL_SUB_DIRS="$(PWD)" - -CLEANFILES = .depend -endif +EXTRA_DIST = BUGS FDL kernel_patches README.kernel-source if LDISKFS LDISKFS = ldiskfs-sources @@ -55,27 +20,17 @@ endif lvfs-sources: $(MAKE) sources -C lvfs -modules: lustre_build_version $(DEP) $(LDISKFS) lvfs-sources - $(MAKE) $(ARCH_UM) CC="$(CC)" -C $(LINUX_OBJ) -f $(PWD)/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$(LINUX_CONFIG) $(MODULE_TARGET)=$(PWD) -o tmp_include_depends -o scripts -o include/config/MARKER $@ - -endif # MODULES +sources: $(LDISKFS) lvfs-sources lustre_build_version all-recursive: lustre_build_version lustre_build_version: - perl $(top_builddir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver + perl 
$(top_builddir)/lustre/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver echo "#define LUSTRE_RELEASE @RELEASE@" >> tmpver - cmp -s $(top_builddir)/include/linux/lustre_build_version.h tmpver \ + cmp -s $(top_builddir)/lustre/include/linux/lustre_build_version.h tmpver \ 2> /dev/null && \ $(RM) tmpver || \ - mv tmpver $(top_builddir)/include/linux/lustre_build_version.h - -dist-hook: - find $(distdir) -name .deps | xargs rm -rf - find $(distdir) -name CVS | xargs rm -rf - -rpms: dist Makefile - rpmbuild -ta $(distdir).tar.gz + mv tmpver $(top_builddir)/lustre/include/linux/lustre_build_version.h CSTK=/tmp/checkstack CSTKO=/tmp/checkstack.orig diff --git a/lustre/autogen.sh b/lustre/autogen.sh deleted file mode 100644 index e1c2c6c..0000000 --- a/lustre/autogen.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# taken from gnome-common/macros2/autogen.sh -compare_versions() { - ch_min_version=$1 - ch_actual_version=$2 - ch_status=0 - IFS="${IFS= }"; ch_save_IFS="$IFS"; IFS="." - set $ch_actual_version - for ch_min in $ch_min_version; do - ch_cur=`echo $1 | sed 's/[^0-9].*$//'`; shift # remove letter suffixes - if [ -z "$ch_min" ]; then break; fi - if [ -z "$ch_cur" ]; then ch_status=1; break; fi - if [ $ch_cur -gt $ch_min ]; then break; fi - if [ $ch_cur -lt $ch_min ]; then ch_status=1; break; fi - done - IFS="$ch_save_IFS" - return $ch_status -} - -error_msg() { - echo "$cmd is $1. version $required is required to build Lustre." - - if [ -e /usr/lib/autolustre/bin/$cmd ]; then - cat >&2 <<-EOF - You apparently already have Lustre-specific autoconf/make RPMs - installed on your system at /usr/lib/autolustre/share/$cmd. - Please set your PATH to point to those versions: - - export PATH="/usr/lib/autolustre/bin:\$PATH" - EOF - else - cat >&2 <<-EOF - CFS provides RPMs which can be installed alongside your - existing autoconf/make RPMs, if you are nervous about - upgrading. 
See - - ftp://ftp.lustre.org/pub/other/autolustre/README.autolustre - - You may be able to download newer version from: - - http://ftp.gnu.org/gnu/$cmd/$cmd-$required.tar.gz - EOF - fi - [ "$cmd" = "autoconf" -a "$required" = "2.57" ] && cat >&2 <&2 </dev/null ; then - error_msg "missing" - fi - version=$($cmd --version | awk "BEGIN { IGNORECASE=1 } /$tool \(GNU $tool\)/ { print \$4 }") - echo "found $version" - if ! compare_versions "$required" "$version" ; then - error_msg "too old" - fi -} - -check_version automake automake-1.7 "1.7.8" -check_version autoconf autoconf "2.57" -echo "Running aclocal..." -aclocal-1.7 -echo "Running autoheader..." -autoheader -echo "Running automake..." -automake-1.7 -a -c -echo "Running autoconf..." -autoconf - diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am index 2a3f201..f2c2d76 100644 --- a/lustre/include/Makefile.am +++ b/lustre/include/Makefile.am @@ -5,5 +5,5 @@ # See the file COPYING in this distribution SUBDIRS = linux lustre -EXTRA_DIST = config.h.in ioctl.h liblustre.h +EXTRA_DIST = ioctl.h liblustre.h diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 5f76d28..2b97e36 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -115,7 +115,6 @@ static inline int cleanup_group_info(void) } while(0) #define kiobuf bio -#define smp_num_cpus num_online_cpus() #include diff --git a/lustre/ldiskfs/autoMakefile.am b/lustre/ldiskfs/autoMakefile.am index 33a6ad7..a31e01a 100644 --- a/lustre/ldiskfs/autoMakefile.am +++ b/lustre/ldiskfs/autoMakefile.am @@ -29,8 +29,8 @@ linux/ldiskfs%.h: linux-stage/include/linux/ext3%.h # FIXME: we need to grab the series in configure somehow # (see bug 1679) # -series := @top_srcdir@/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) -patches := @top_srcdir@/kernel_patches/patches +series := @top_srcdir@/lustre/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) +patches := 
@top_srcdir@/lustre/kernel_patches/patches sources: $(ext3_sources) $(ext3_headers) $(linux_headers) $(series) rm -rf linux-stage linux sources $(ldiskfs_SOURCES) diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am index f75fcab..a8a5545 100644 --- a/lustre/liblustre/tests/Makefile.am +++ b/lustre/liblustre/tests/Makefile.am @@ -22,27 +22,27 @@ libtestcommon_a_SOURCES = test_common.c test_common.h echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c echo_test_CFLAGS = $(LL_CFLAGS) echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread -echo_test_DEPENDENCIES=$(top_builddir)/liblustre/liblsupport.a +echo_test_DEPENDENCIES=$(top_builddir)/lustre/liblustre/liblsupport.a sanity_SOURCES = sanity.c sanity_CFLAGS = $(LL_CFLAGS) sanity_LDADD := ./libtestcommon.a $(LLIB_EXEC) -sanity_DEPENDENCIES = $(top_builddir)/liblustre/liblustre.a ./libtestcommon.a +sanity_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a ./libtestcommon.a recovery_small_SOURCES = recovery_small.c recovery_small_CFLAGS = $(LL_CFLAGS) recovery_small_LDADD := ./libtestcommon.a $(LLIB_EXEC) -recovery_small_DEPENDENCIES = $(top_builddir)/liblustre/liblustre.a +recovery_small_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a replay_single_SOURCES = replay_single.c replay_single_CFLAGS = $(LL_CFLAGS) replay_single_LDADD := ./libtestcommon.a $(LLIB_EXEC) -replay_single_DEPENDENCIES = $(top_builddir)/liblustre/liblustre.a +replay_single_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a replay_ost_single_SOURCES = replay_ost_single.c replay_ost_single_CFLAGS = $(LL_CFLAGS) replay_ost_single_LDADD := ./libtestcommon.a $(LLIB_EXEC) -replay_ost_single_DEPENDENCIES = $(top_builddir)/liblustre/liblustre.a +replay_ost_single_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a if MPITESTS test_lock_cancel_SOURCES = test_lock_cancel.c diff --git a/lustre/lvfs/autoMakefile.am 
b/lustre/lvfs/autoMakefile.am index 955e460..0b642f7 100644 --- a/lustre/lvfs/autoMakefile.am +++ b/lustre/lvfs/autoMakefile.am @@ -35,6 +35,10 @@ ldiskfs_sed_flags = \ fsfilt_ldiskfs.c: fsfilt_ext3.c sed $(strip $(ldiskfs_sed_flags)) $< > $@ +else + +sources: + endif # MODULES DIST_SOURCES = fsfilt.c fsfilt_ext3.c fsfilt_reiserfs.c lvfs_common.c \ diff --git a/lustre/portals/.cvsignore b/lustre/portals/.cvsignore deleted file mode 100644 index f30d862..0000000 --- a/lustre/portals/.cvsignore +++ /dev/null @@ -1,11 +0,0 @@ -Kernelenv -Makefile -autoMakefile -autoMakefile.in -aclocal.m4 -autom4te.cache -config.log -config.status -configure -.*.cmd -.depend diff --git a/lustre/portals/AUTHORS b/lustre/portals/AUTHORS deleted file mode 100644 index e69de29..0000000 diff --git a/lustre/portals/ChangeLog b/lustre/portals/ChangeLog deleted file mode 100644 index e69de29..0000000 diff --git a/lustre/portals/Kernelenv.in b/lustre/portals/Kernelenv.in deleted file mode 100644 index 7a48c58..0000000 --- a/lustre/portals/Kernelenv.in +++ /dev/null @@ -1,6 +0,0 @@ -EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include -# portals/utils/debug.c wants from userspace. sigh. 
-HOSTCFLAGS := -I@LINUX@/include $(EXTRA_CFLAGS) -LIBREADLINE := @LIBREADLINE@ -# 2.5's makefiles aren't nice to cross dir libraries in host programs -PTLCTLOBJS := debug.o l_ioctl.o parser.o portals.o diff --git a/lustre/portals/Kernelenv.mk b/lustre/portals/Kernelenv.mk deleted file mode 100644 index 7c66dfa..0000000 --- a/lustre/portals/Kernelenv.mk +++ /dev/null @@ -1,4 +0,0 @@ -EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include -HOSTCFLAGS := $(EXTRA_CFLAGS) -# the kernel doesn't want us to build archives for host binaries :/ -PTLCTLOBJS := debug.o l_ioctl.o parser.o portals.o diff --git a/lustre/portals/Makefile.in b/lustre/portals/Makefile.in deleted file mode 100644 index 71d0dc8..0000000 --- a/lustre/portals/Makefile.in +++ /dev/null @@ -1,9 +0,0 @@ -subdir-m += libcfs - -cray-subdirs += portals -cray-subdirs += knals -cray-subdirs += router -cray-subdirs += tests -@CRAY_PORTALS_FALSE@subdir-m += $(cray-subdirs) - -@INCLUDE_RULES@ diff --git a/lustre/portals/Makefile.mk b/lustre/portals/Makefile.mk deleted file mode 100644 index 73a19df..0000000 --- a/lustre/portals/Makefile.mk +++ /dev/null @@ -1,12 +0,0 @@ -include $(src)/Kernelenv - -# The ordering of these determines the order that each subsystem's -# module_init() functions are called in. if these are changed make sure -# they reflect the dependencies between each subsystem's _init functions. 
-obj-y += libcfs/ -obj-y += portals/ -obj-y += router/ -obj-y += knals/ -obj-y += tests/ - -obj-m += utils/ diff --git a/lustre/portals/NEWS b/lustre/portals/NEWS deleted file mode 100644 index e69de29..0000000 diff --git a/lustre/portals/README b/lustre/portals/README deleted file mode 100644 index e69de29..0000000 diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 deleted file mode 100644 index 2f1d5636..0000000 --- a/lustre/portals/archdep.m4 +++ /dev/null @@ -1,902 +0,0 @@ -# -------- we can't build modules unless srcdir = builddir -if test x$enable_modules != xno ; then - AC_CHECK_FILE([autoMakefile.am],[], - [AC_MSG_ERROR([At this time, Lustre does not support building kernel modules with srcdir != buildir.])]) -fi - -# -------- in kernel compilation? (2.5 only) ------------- -AC_MSG_CHECKING([if inkernel build support is requested]) -AC_ARG_ENABLE([inkernel], - AC_HELP_STRING([--enable-inkernel], - [set up 2.5 kernel makefiles]), - [],[enable_inkernel=no]) -AC_MSG_RESULT([$enable_inkernel]) -AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) - -# -------- are we building against an external portals? 
------- -AC_MSG_CHECKING([for Cray portals]) -AC_ARG_WITH([cray-portals], - AC_HELP_STRING([--with-cray-portals=path], - [path to cray portals]), - [ - if test "$with_cray_portals" != no; then - CRAY_PORTALS_PATH=$with_cray_portals - CRAY_PORTALS_INCLUDES="$with_cray_portals/include" - CRAY_PORTALS_LIBS="$with_cray_portals" - fi - ],[with_cray_portals=no]) -AC_SUBST(CRAY_PORTALS_PATH) -AC_MSG_RESULT([$CRAY_PORTALS_PATH]) - -AC_MSG_CHECKING([for Cray portals includes]) -AC_ARG_WITH([cray-portals-includes], - AC_HELP_STRING([--with-cray-portals-includes=path], - [path to cray portals includes]), - [ - if test "$with_cray_portals_includes" != no; then - CRAY_PORTALS_INCLUDES="$with_cray_portals_includes" - fi - ]) -AC_SUBST(CRAY_PORTALS_INCLUDES) -AC_MSG_RESULT([$CRAY_PORTALS_INCLUDES]) - -AC_MSG_CHECKING([for Cray portals libs]) -AC_ARG_WITH([cray-portals-libs], - AC_HELP_STRING([--with-cray-portals-libs=path], - [path to cray portals libs]), - [ - if test "$with_cray_portals_libs" != no; then - CRAY_PORTALS_LIBS="$with_cray_portals_libs" - fi - ]) -AC_SUBST(CRAY_PORTALS_LIBS) -AC_MSG_RESULT([$CRAY_PORTALS_LIBS]) - -if test x$CRAY_PORTALS_INCLUDES != x ; then - if test ! -r $CRAY_PORTALS_INCLUDES/portals/api.h ; then - AC_MSG_ERROR([Cray portals headers were not found in $CRAY_PORTALS_INCLUDES. Please check the paths passed to --with-cray-portals or --with-cray-portals-includes.]) - fi -fi -if test x$CRAY_PORTALS_LIBS != x ; then - if test ! -r $CRAY_PORTALS_LIBS/libportals.a ; then - AC_MSG_ERROR([Cray portals libraries were not found in $CRAY_PORTALS_LIBS. 
Please check the paths passed to --with-cray-portals or --with-cray-portals-libs.]) - fi -fi - -AC_MSG_CHECKING([whether to use Cray portals]) -if test x$CRAY_PORTALS_INCLUDES != x -a x$CRAY_PORTALS_LIBS != x ; then - with_cray_portals=yes - AC_DEFINE(CRAY_PORTALS, 1, [Building with Cray Portals]) - CRAY_PORTALS_INCLUDES="-I$CRAY_PORTALS_INCLUDES" -else - with_cray_portals=no -fi -AC_MSG_RESULT([$with_cray_portals]) -AM_CONDITIONAL(CRAY_PORTALS, test x$with_cray_portals != xno) - -# ---------------------------------------- -# some tests for catamount-like systems -# ---------------------------------------- -AC_ARG_ENABLE([sysio_init], - AC_HELP_STRING([--disable-sysio-init], - [call sysio init functions when initializing liblustre]), - [],[enable_sysio_init=yes]) -AC_MSG_CHECKING([whether to initialize libsysio]) -AC_MSG_RESULT([$enable_sysio_init]) -if test x$enable_sysio_init != xno ; then - AC_DEFINE([INIT_SYSIO], 1, [call sysio init functions]) -fi - -AC_ARG_ENABLE([urandom], - AC_HELP_STRING([--disable-urandom], - [disable use of /dev/urandom for liblustre]), - [],[enable_urandom=yes]) -AC_MSG_CHECKING([whether to use /dev/urandom for liblustre]) -AC_MSG_RESULT([$enable_urandom]) -if test x$enable_urandom != xno ; then - AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data]) -fi - -# -------- check for -lcap and -lpthread ---- -if test x$enable_liblustre = xyes ; then - AC_CHECK_LIB([cap], [cap_get_proc], - [ - CAP_LIBS="-lcap" - AC_DEFINE([HAVE_LIBCAP], 1, [use libcap]) - ], - [CAP_LIBS=""]) - AC_SUBST(CAP_LIBS) - AC_CHECK_LIB([pthread], [pthread_create], - [ - PTHREAD_LIBS="-lpthread" - AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) - ], - [PTHREAD_LIBS=""]) - AC_SUBST(PTHREAD_LIBS) -fi - -# -------- enable tests and utils? 
------- -if test x$enable_tests = xno ; then - AC_MSG_NOTICE([disabling tests]) - enable_tests=no -fi -if test x$enable_utils = xno ; then - AC_MSG_NOTICE([disabling utilities]) - enable_utils=no -fi - -if test x$enable_modules != xno ; then - # -------- set linuxdir ------------ - AC_MSG_CHECKING([for Linux sources]) - AC_ARG_WITH([linux], - AC_HELP_STRING([--with-linux=path], - [set path to Linux source (default=/usr/src/linux)]), - [LINUX=$with_linux], - [LINUX=/usr/src/linux]) - AC_MSG_RESULT([$LINUX]) - AC_SUBST(LINUX) - if test x$enable_inkernel = xyes ; then - echo ln -s `pwd` $LINUX/fs/lustre - rm $LINUX/fs/lustre - ln -s `pwd` $LINUX/fs/lustre - fi - - # -------- linux objects (for 2.6) -- - AC_MSG_CHECKING([for Linux objects dir]) - AC_ARG_WITH([linux-obj], - AC_HELP_STRING([--with-linux-obj=path], - [set path to Linux objects dir (default=\$LINUX)]), - [LINUX_OBJ=$with_linux_obj], - [LINUX_OBJ=$LINUX]) - AC_MSG_RESULT([$LINUX_OBJ]) - AC_SUBST(LINUX_OBJ) - - # -------- check for .confg -------- - AC_ARG_WITH([linux-config], - [AC_HELP_STRING([--with-linux-config=path], - [set path to Linux .conf (default=\$LINUX_OBJ/.config)])], - [LINUX_CONFIG=$with_linux_config], - [LINUX_CONFIG=$LINUX_OBJ/.config]) - AC_SUBST(LINUX_CONFIG) - - AC_CHECK_FILE([/boot/kernel.h], - [KERNEL_SOURCE_HEADER='/boot/kernel.h'], - [AC_CHECK_FILE([/var/adm/running-kernel.h]), - [KERNEL_SOURCE_HEADER='/var/adm/running-kernel.h']]) - - AC_ARG_WITH([kernel-source-header], - AC_HELP_STRING([--with-kernel-source-header=path], - [Use a different kernel version header. 
Consult README.kernel-source for details.]), - [KERNEL_SOURCE_HEADER=$with_kernel_source_header]) - - # -------------------- - ARCH_UM= - UML_CFLAGS= - - AC_MSG_CHECKING([if you are running user mode linux for $host_cpu]) - if test -e $LINUX/include/asm-um ; then - if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then - ARCH_UM='ARCH=um' - # see notes in Rules.in - UML_CFLAGS='-O0' - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT([no (asm doesn't point at asm-um)]) - fi - else - AC_MSG_RESULT([no (asm-um missing)]) - fi - - AC_SUBST(ARCH_UM) - AC_SUBST(UML_CFLAGS) - - # --------- Linux 25 ------------------ - AC_CHECK_FILE([$LINUX/include/linux/namei.h], - [ - linux25="yes" - KMODEXT=".ko" - enable_ldiskfs="yes" - BACKINGFS="ldiskfs" - ],[ - KMODEXT=".o" - linux25="no" - ]) - AC_MSG_CHECKING([if you are using Linux 2.6]) - AC_MSG_RESULT([$linux25]) - - AC_SUBST(LINUX25) - AC_SUBST(KMODEXT) - - AC_PATH_PROG(PATCH, patch, [no]) - AC_PATH_PROG(QUILT, quilt, [no]) - - if test x$enable_ldiskfs$PATCH$QUILT = xyesnono ; then - AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)]) - fi -fi -AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) -AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) - -# ------- Makeflags ------------------ - -CPPFLAGS="$CPPFLAGS $CRAY_PORTALS_INCLUDES -I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include" - -# liblustre are all the same -LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1" -AC_SUBST(LLCPPFLAGS) - -LLCFLAGS="-g -Wall -fPIC" -AC_SUBST(LLCFLAGS) - -# everyone builds against portals and lustre - -if test x$enable_ldiskfs = xyes ; then - AC_DEFINE(CONFIG_LDISKFS_FS_MODULE, 1, [build ldiskfs as a module]) - AC_DEFINE(CONFIG_LDISKFS_FS_XATTR, 1, [enable extended attributes for ldiskfs]) - AC_DEFINE(CONFIG_LDISKFS_FS_POSIX_ACL, 1, [enable posix acls]) - AC_DEFINE(CONFIG_LDISKFS_FS_SECURITY, 1, [enable fs security]) -fi - -EXTRA_KCFLAGS="-g 
$CRAY_PORTALS_INCLUDES -I$PWD/portals/include -I$PWD/include" - -# these are like AC_TRY_COMPILE, but try to build modules against the -# kernel, inside the kernel-tests directory - -AC_DEFUN([LUSTRE_MODULE_CONFTEST], -[cat >conftest.c <<_ACEOF -$1 -_ACEOF -]) - -AC_DEFUN([LUSTRE_MODULE_COMPILE_IFELSE], -[m4_ifvaln([$1], [LUSTRE_MODULE_CONFTEST([$1])])dnl -rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.ko -AS_IF([AC_TRY_COMMAND(cp conftest.c kernel-tests && make [$2] CC="$CC" -f $PWD/kernel-tests/Makefile LUSTRE_LINUX_CONFIG=$LINUX_CONFIG -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration $EXTRA_KCFLAGS" $ARCH_UM $MODULE_TARGET=$PWD/kernel-tests) >/dev/null && AC_TRY_COMMAND([$3])], - [$4], - [_AC_MSG_LOG_CONFTEST -m4_ifvaln([$5],[$5])dnl])dnl -rm -f kernel-tests/conftest.o kernel-tests/conftest.mod.c kernel-tests/conftest.mod.o kernel-tests/conftest.ko m4_ifval([$1], [kernel-tests/conftest.c conftest.c])[]dnl -]) - -AC_DEFUN([LUSTRE_MODULE_TRY_COMPILE], -[LUSTRE_MODULE_COMPILE_IFELSE( - [AC_LANG_PROGRAM([[$1]], [[$2]])], - [modules], - [test -s kernel-tests/conftest.o], - [$3], [$4])]) - -AC_DEFUN([LUSTRE_MODULE_TRY_MAKE], -[LUSTRE_MODULE_COMPILE_IFELSE([AC_LANG_PROGRAM([[$1]], [[$2]])], [$3], [$4], [$5], [$6])]) - -# ------------ include paths ------------------ - -if test x$enable_modules != xno ; then - # ------------ .config exists ---------------- - AC_CHECK_FILE([$LINUX_CONFIG],[], - [AC_MSG_ERROR([Kernel config could not be found. If you are building from a kernel-source rpm consult README.kernel-source])]) - - # ----------- make dep run? 
------------------ - AC_CHECK_FILES([$LINUX_OBJ/include/linux/autoconf.h - $LINUX_OBJ/include/linux/version.h - $LINUX/include/linux/config.h],[], - [AC_MSG_ERROR([Run make config in $LINUX.])]) - - # ------------ rhconfig.h includes runtime-generated bits -- - # red hat kernel-source checks - - # we know this exists after the check above. if the user - # tarred up the tree and ran make dep etc. in it, then - # version.h gets overwritten with a standard linux one. - - if grep rhconfig $LINUX_OBJ/include/linux/version.h >/dev/null ; then - # This is a clean kernel-source tree, we need to - # enable extensive workarounds to get this to build - # modules - AC_CHECK_FILE([$KERNEL_SOURCE_HEADER], - [if test $KERNEL_SOURCE_HEADER = '/boot/kernel.h' ; then - AC_MSG_WARN([Using /boot/kernel.h from RUNNING kernel.]) - AC_MSG_WARN([If this is not what you want, use --with-kernel-source-header.]) - AC_MSG_WARN([Consult README.kernel-source for details.]) - fi], - [AC_MSG_ERROR([$KERNEL_SOURCE_HEADER not found. 
Consult README.kernel-source for details.])]) - EXTRA_KCFLAGS="-include $KERNEL_SOURCE_HEADER $EXTRA_KCFLAGS" - fi - - # ------------ external module support --------------------- - MODULE_TARGET="SUBDIRS" - if test $linux25 = 'yes' ; then - makerule="$PWD/kernel-tests" - AC_MSG_CHECKING([for external module build support]) - rm -f kernel-tests/conftest.i - LUSTRE_MODULE_TRY_MAKE([],[], - [$makerule LUSTRE_KERNEL_TEST=conftest.i], - [test -s kernel-tests/conftest.i], - [ - AC_MSG_RESULT([no]) - ],[ - AC_MSG_RESULT([yes]) - makerule="_module_$makerule" - MODULE_TARGET="M" - ]) - else - makerule="_dir_$PWD/kernel-tests" - fi - AC_SUBST(MODULE_TARGET) - - # --- check that we can build modules at all - AC_MSG_CHECKING([that modules can be built]) - LUSTRE_MODULE_TRY_COMPILE([],[], - [ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_WARN([Consult config.log for details.]) - AC_MSG_WARN([If you are trying to build with a kernel-source rpm, consult README.kernel-source]) - AC_MSG_ERROR([Kernel modules could not be built.]) - ]) - - # ------------ LINUXRELEASE and moduledir ------------------ - LINUXRELEASE= - rm -f kernel-tests/conftest.i - AC_MSG_CHECKING([for Linux release]) - LUSTRE_MODULE_TRY_MAKE( - [#include ], - [char *LINUXRELEASE; - LINUXRELEASE=UTS_RELEASE;], - [$makerule LUSTRE_KERNEL_TEST=conftest.i], - [test -s kernel-tests/conftest.i], - [ - # LINUXRELEASE="UTS_RELEASE" - eval $(grep "LINUXRELEASE=" kernel-tests/conftest.i) - ],[ - AC_MSG_RESULT([unknown]) - AC_MSG_ERROR([Could not preprocess test program. 
Consult config.log for details.]) - ]) - rm -f kernel-tests/conftest.i - if test x$LINUXRELEASE = x ; then - AC_MSG_RESULT([unknown]) - AC_MSG_ERROR([Could not determine Linux release version from linux/version.h.]) - fi - AC_MSG_RESULT([$LINUXRELEASE]) - AC_SUBST(LINUXRELEASE) - - moduledir='/lib/modules/'$LINUXRELEASE/kernel - modulefsdir='$(moduledir)/fs/$(PACKAGE)' - modulenetdir='$(moduledir)/net/$(PACKAGE)' - - AC_SUBST(moduledir) - AC_SUBST(modulefsdir) - AC_SUBST(modulenetdir) - - # ------------ RELEASE -------------------------------- - AC_MSG_CHECKING([for Lustre release]) - RELEASE="`echo ${LINUXRELEASE} | tr '-' '_'`_`date +%Y%m%d%H%M`" - AC_MSG_RESULT($RELEASE) - AC_SUBST(RELEASE) - - # ---------- Portals flags -------------------- - - AC_MSG_CHECKING([for zero-copy TCP support]) - AC_ARG_ENABLE([zerocopy], - AC_HELP_STRING([--disable-zerocopy], - [disable socknal zerocopy]), - [],[enable_zerocopy='yes']) - if test x$enable_zerocopy = xno ; then - AC_MSG_RESULT([no (by request)]) - else - ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`" - if test "$ZCCD" != 0 ; then - AC_DEFINE(SOCKNAL_ZC, 1, [use zero-copy TCP]) - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT([no (no kernel support)]) - fi - fi - - AC_ARG_ENABLE([affinity], - AC_HELP_STRING([--disable-affinity], - [disable process/irq affinity]), - [],[enable_affinity='yes']) - - AC_MSG_CHECKING([for CPU affinity support]) - if test x$enable_affinity = xno ; then - AC_MSG_RESULT([no (by request)]) - else - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - struct task_struct t; - #ifdef CPU_ARRAY_SIZE - cpumask_t m; - #else - unsigned long m; - #endif - set_cpus_allowed(&t, m); - ],[ - AC_DEFINE(CPU_AFFINITY, 1, [kernel has cpu affinity support]) - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no (no kernel support)]) - ]) - fi - - ##################################### - - AC_MSG_CHECKING([if quadrics kernel headers are present]) - if test -d $LINUX/drivers/net/qsnet ; then - AC_MSG_RESULT([yes]) - 
QSWNAL="qswnal" - AC_MSG_CHECKING([for multirail EKC]) - if test -f $LINUX/include/elan/epcomms.h; then - AC_MSG_RESULT([supported]) - QSWCPPFLAGS="-DMULTIRAIL_EKC=1" - else - AC_MSG_RESULT([not supported]) - if test -d $LINUX/drivers/net/qsnet/include; then - QSWCPPFLAGS="-I$LINUX/drivers/net/qsnet/include" - else - QSWCPPFLAGS="-I$LINUX/include/linux" - fi - fi - else - AC_MSG_RESULT([no]) - QSWNAL="" - QSWCPPFLAGS="" - fi - AC_SUBST(QSWCPPFLAGS) - AC_SUBST(QSWNAL) - - AC_MSG_CHECKING([if gm support was requested]) - AC_ARG_WITH([gm], - AC_HELP_STRING([--with-gm=path], - [build gmnal against path]), - [ - case $with_gm in - yes) - AC_MSG_RESULT([yes]) - GMCPPFLAGS="-I/usr/local/gm/include" - GMNAL="gmnal" - ;; - no) - AC_MSG_RESULT([no]) - GMCPPFLAGS="" - GMNAL="" - ;; - *) - AC_MSG_RESULT([yes]) - GMCPPFLAGS="-I$with_gm/include -I$with_gm/drivers -I$with_gm/drivers/linux/gm" - GMNAL="gmnal" - ;; - esac - ],[ - AC_MSG_RESULT([no]) - GMCPPFLAGS="" - GMNAL="" - ]) - AC_SUBST(GMCPPFLAGS) - AC_SUBST(GMNAL) - - if test $linux25 = 'no' ; then - #### OpenIB - AC_MSG_CHECKING([if OpenIB kernel headers are present]) - OPENIBCPPFLAGS="-I$LINUX/drivers/infiniband/include -DIN_TREE_BUILD" - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS" - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - struct ib_device_properties props; - return 0; - ],[ - AC_MSG_RESULT([yes]) - OPENIBNAL="openibnal" - ],[ - AC_MSG_RESULT([no]) - OPENIBNAL="" - OPENIBCPPFLAGS="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" - AC_SUBST(OPENIBCPPFLAGS) - AC_SUBST(OPENIBNAL) - fi - - #### Infinicon IB - AC_MSG_CHECKING([if Infinicon IB kernel headers are present]) - # for how the only infinicon ib build has headers in /usr/include/iba - IIBCPPFLAGS="-I/usr/include -DIN_TREE_BUILD" - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $IIBCPPFLAGS" - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - IBT_INTERFACE_UNION interfaces; - FSTATUS rc; - - rc = 
IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, - &interfaces); - - return rc == FSUCCESS ? 0 : 1; - ],[ - AC_MSG_RESULT([yes]) - IIBNAL="iibnal" - ],[ - AC_MSG_RESULT([no]) - IIBNAL="" - IIBCPPFLAGS="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" - AC_SUBST(IIBCPPFLAGS) - AC_SUBST(IIBNAL) - - #### Rapid Array - AC_MSG_CHECKING([if RapidArray kernel headers are present]) - # placeholder - RACPPFLAGS="-I/tmp" - EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="$EXTRA_KCFLAGS $RACPPFLAGS" - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - #include - ],[ - RAP_RETURN rc; - RAP_PVOID dev_handle; - - rc = RapkGetDeviceByIndex(0, NULL, &dev_handle); - - return rc == RAP_SUCCESS ? 0 : 1; - ],[ - AC_MSG_RESULT([yes]) - RANAL="ranal" - ],[ - AC_MSG_RESULT([no]) - RANAL="" - RACPPFLAGS="" - ]) - EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" - AC_SUBST(RACPPFLAGS) - AC_SUBST(RANAL) - - # ---------- Red Hat 2.4.18 has iobuf->dovary -------------- - # But other kernels don't - - AC_MSG_CHECKING([if struct kiobuf has a dovary field]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - struct kiobuf iobuf; - iobuf.dovary = 1; - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_KIOBUF_DOVARY, 1, [struct kiobuf has a dovary field]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ----------- 2.6.4 no longer has page->list --------------- - AC_MSG_CHECKING([if struct page has a list field]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - struct page page; - &page.list; - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_PAGE_LIST, 1, [struct page has a list field]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- - # This needs to run after we've defined the KCPPFLAGS - - AC_MSG_CHECKING([if task_struct has a sighand field]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - struct task_struct p; - p.sighand = NULL; - ],[ - AC_DEFINE(CONFIG_RH_2_4_20, 1, [this kernel contains Red Hat 2.4.20 patches]) - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # 
---------- 2.4.20 introduced cond_resched -------------- - - AC_MSG_CHECKING([if kernel offers cond_resched]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - cond_resched(); - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # --------- zap_page_range(vma) -------------------------------- - AC_MSG_CHECKING([if zap_pag_range with vma parameter]) - ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`" - if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then - AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter]) - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - fi - - # ---------- Red Hat 2.4.21 backports some more 2.5 bits -------- - - AC_MSG_CHECKING([if kernel defines PDE]) - HAVE_PDE="`grep -c 'proc_dir_entry..PDE' $LINUX/include/linux/proc_fs.h`" - if test "$HAVE_PDE" != 0 ; then - AC_DEFINE(HAVE_PDE, 1, [the kernel defines PDE]) - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - fi - - AC_MSG_CHECKING([if kernel passes struct file to direct_IO]) - HAVE_DIO_FILE="`grep -c 'direct_IO.*struct file' $LINUX/include/linux/fs.h`" - if test "$HAVE_DIO_FILE" != 0 ; then - AC_DEFINE(HAVE_DIO_FILE, 1, [the kernel passes struct file to direct_IO]) - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT(no) - fi - - AC_MSG_CHECKING([if kernel defines cpu_online()]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - cpu_online(0); - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_CPU_ONLINE, 1, [cpu_online found]) - ],[ - AC_MSG_RESULT([no]) - ]) - AC_MSG_CHECKING([if kernel defines cpumask_t]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - return sizeof (cpumask_t); - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_CPUMASK_T, 1, [cpumask_t found]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ---------- RHEL kernels define page_count in mm_inline.h - AC_MSG_CHECKING([if kernel has mm_inline.h header]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef 
page_count - #error mm_inline.h does not define page_count - #endif - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_MM_INLINE, 1, [mm_inline found]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ---------- inode->i_alloc_sem -------------- - AC_MSG_CHECKING([if struct inode has i_alloc_sem]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - #include - ],[ - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,24)) - #error "down_read_trylock broken before 2.4.24" - #endif - struct inode i; - return (char *)&i.i_alloc_sem - (char *)&i; - ],[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_I_ALLOC_SEM, 1, [struct inode has i_alloc_sem]) - ],[ - AC_MSG_RESULT([no]) - ]) - - - # ---------- modules? ------------------------ - AC_MSG_CHECKING([for module support]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_MODULES - #error CONFIG_MODULES not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_ERROR([module support is required to build Lustre kernel modules.]) - ]) - - # ---------- modversions? 
-------------------- - AC_MSG_CHECKING([for MODVERSIONS]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_MODVERSIONS - #error CONFIG_MODVERSIONS not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ------------ preempt ----------------------- - AC_MSG_CHECKING([if preempt is enabled]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_PREEMPT - #error CONFIG_PREEMPT is not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - AC_MSG_ERROR([Lustre does not support kernels with preempt enabled.]) - ],[ - AC_MSG_RESULT([no]) - ]) - - # ------------ kallsyms (so software watchdogs produce useful stacks) - AC_MSG_CHECKING([if kallsyms is enabled]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_KALLSYMS - #error CONFIG_KALLSYMS is not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - if test "x$ARCH_UM" = "x" ; then - AC_MSG_ERROR([Lustre requires that CONFIG_KALLSYMS is enabled in your kernel.]) - fi - ]) - - # ------------ check for our show_task patch - AC_MSG_CHECKING([if kernel exports show_task]) - have_show_task=0 - for file in ksyms sched ; do - if grep -q "EXPORT_SYMBOL(show_task)" \ - "$LINUX/kernel/$file.c" 2>/dev/null ; then - have_show_task=1 - break - fi - done - if test x$have_show_task = x1 ; then - AC_DEFINE(HAVE_SHOW_TASK, 1, [show_task is exported]) - AC_MSG_RESULT(yes) - else - AC_MSG_RESULT(no) - fi - - case $BACKINGFS in - ext3) - # --- Check that ext3 and ext3 xattr are enabled in the kernel - AC_MSG_CHECKING([that ext3 is enabled in the kernel]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_EXT3_FS - #ifndef CONFIG_EXT3_FS_MODULE - #error CONFIG_EXT3_FS not #defined - #endif - #endif - ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_ERROR([Lustre requires that ext3 is enabled in the kernel (CONFIG_EXT3_FS)]) - ]) - - AC_MSG_CHECKING([that extended attributes for ext3 are enabled in the kernel]) - 
LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[ - #ifndef CONFIG_EXT3_FS_XATTR - #error CONFIG_EXT3_FS_XATTR not #defined - #endif - ],[ - AC_MSG_RESULT([yes]) - ],[ - AC_MSG_RESULT([no]) - AC_MSG_WARN([Lustre requires that extended attributes for ext3 are enabled in the kernel (CONFIG_EXT3_FS_XATTR.)]) - AC_MSG_WARN([This build may fail.]) - ]) - ;; - ldiskfs) - AC_MSG_CHECKING([if fshooks are present]) - LUSTRE_MODULE_TRY_COMPILE( - [ - #include - ],[],[ - AC_MSG_RESULT([yes]) - LDISKFS_SERIES="2.6-suse.series" - ],[ - AC_MSG_RESULT([no]) - LDISKFS_SERIES="2.6-vanilla.series" - ]) - AC_SUBST(LDISKFS_SERIES) - # --- check which ldiskfs series we should use - ;; - esac # $BACKINGFS -fi - -AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal") -AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") -AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") -AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") -AM_CONDITIONAL(BUILD_RANAL, test x$RANAL = "xranal") - -# portals/utils/portals.c -AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h]) -AC_CHECK_FUNCS([gethostbyname socket connect]) - -# portals/utils/debug.c -AC_CHECK_HEADERS([linux/version.h]) - -# include/liblustre.h -AC_CHECK_HEADERS([asm/page.h sys/user.h stdint.h]) - -# liblustre/llite_lib.h -AC_CHECK_HEADERS([xtio.h file.h]) - -# liblustre/dir.c -AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h]) - -# liblustre/lutil.c -AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h]) -AC_CHECK_FUNCS([inet_ntoa]) - -CPPFLAGS="-include \$(top_builddir)/include/config.h $CPPFLAGS" -EXTRA_KCFLAGS="-include $PWD/include/config.h $EXTRA_KCFLAGS" -AC_SUBST(EXTRA_KCFLAGS) - -echo "CPPFLAGS: $CPPFLAGS" -echo "LLCPPFLAGS: $LLCPPFLAGS" -echo "CFLAGS: $CFLAGS" -echo "EXTRA_KCFLAGS: $EXTRA_KCFLAGS" -echo "LLCFLAGS: $LLCFLAGS" - -ENABLE_INIT_SCRIPTS=0 -if test x$enable_utils = xyes ; then - AC_MSG_CHECKING([whether to install init scripts]) - # our scripts only work on red hat systems - if 
test -f /etc/init.d/functions -a -f /etc/sysconfig/network ; then - ENABLE_INIT_SCRIPTS=1 - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - fi -fi -AM_CONDITIONAL(INIT_SCRIPTS, test x$ENABLE_INIT_SCRIPTS = "x1") -AC_SUBST(ENABLE_INIT_SCRIPTS) diff --git a/lustre/portals/autoMakefile.am b/lustre/portals/autoMakefile.am deleted file mode 100644 index 485ff04..0000000 --- a/lustre/portals/autoMakefile.am +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -EXTRA_DIST = archdep.m4 build.m4 - -SUBDIRS = portals libcfs knals unals router tests doc utils include diff --git a/lustre/portals/autogen.sh b/lustre/portals/autogen.sh deleted file mode 100755 index 9deed73..0000000 --- a/lustre/portals/autogen.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -aclocal && -automake --add-missing && -${AUTOCONF:-autoconf} diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4 deleted file mode 100644 index f158396..0000000 --- a/lustre/portals/build.m4 +++ /dev/null @@ -1,127 +0,0 @@ -# ---------- other tests and settings --------- - -AC_CHECK_TYPE([spinlock_t], - [AC_DEFINE(HAVE_SPINLOCK_T, 1, [spinlock_t is defined])], - [], - [#include ]) - -# --------- unsigned long long sane? ------- - -AC_CHECK_SIZEOF(unsigned long long, 0) -echo "---> size SIZEOF $SIZEOF_unsigned_long_long" -echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" -if test $ac_cv_sizeof_unsigned_long_long != 8 ; then - AC_MSG_ERROR([** we assume that sizeof(long long) == 8. Tell phil@clusterfs.com]) -fi - -# directories for binaries -ac_default_prefix=/usr - -# mount.lustre -rootsbindir='/sbin' -AC_SUBST(rootsbindir) -sysconfdir='/etc' -AC_SUBST(sysconfdir) -# Directories for documentation and demos. 
-docdir='${datadir}/doc/$(PACKAGE)' -AC_SUBST(docdir) -demodir='$(docdir)/demo' -AC_SUBST(demodir) -pkgexampledir='${pkgdatadir}/examples' -AC_SUBST(pkgexampledir) -pymoddir='${pkglibdir}/python/Lustre' -AC_SUBST(pymoddir) - -# ---------- BAD gcc? ------------ -AC_PROG_RANLIB -AC_PROG_CC -AC_MSG_CHECKING([for buggy compiler]) -CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` -bad_cc() { - AC_MSG_RESULT([buggy compiler found!]) - echo - echo " '$CC_VERSION'" - echo " has been known to generate bad code, " - echo " please get an updated compiler." - AC_MSG_ERROR([sorry]) -} -TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` -if test "$TMP_VERSION" = "gcc version 2.95"; then - bad_cc -fi -case "$CC_VERSION" in - # ost_pack_niobuf putting 64bit NTOH temporaries on the stack - # without "sub $0xc,%esp" to protect the stack from being - # stomped on by interrupts (bug 606) - "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") - bad_cc - ;; - # mandrake's similar sub 0xc compiler bug - # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2 - "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") - bad_cc - ;; - # unpatched 'gcc' on rh9. miscompiles a - # struct = (type) { .member = value, }; - # asignment in the iibnal where the struct is a mix - # of u64 and u32 bit-fields. - "gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)") - bad_cc - ;; - *) - AC_MSG_RESULT([no known problems]) - ;; -esac -# end ------ BAD gcc? 
------------ - -# -------- Check for required packages -------------- - -# this doesn't seem to work on older autoconf -# AC_CHECK_LIB(readline, readline,,) -AC_MSG_CHECKING([for readline support]) -AC_ARG_ENABLE(readline, - AC_HELP_STRING([--disable-readline], - [do not use readline library]), - [],[enable_readline='yes']) -AC_MSG_RESULT([$enable_readline]) -if test x$enable_readline = xyes ; then - LIBREADLINE="-lreadline -lncurses" - AC_DEFINE(HAVE_LIBREADLINE, 1, [readline library is available]) -else - LIBREADLINE="" -fi -AC_SUBST(LIBREADLINE) - -AC_MSG_CHECKING([if efence debugging support is requested]) -AC_ARG_ENABLE(efence, - AC_HELP_STRING([--enable-efence], - [use efence library]), - [],[enable_efence='no']) -AC_MSG_RESULT([$enable_efence]) -if test "$enable_efence" = "yes" ; then - LIBEFENCE="-lefence" - AC_DEFINE(HAVE_LIBEFENCE, 1, [libefence support is requested]) -else - LIBEFENCE="" -fi -AC_SUBST(LIBEFENCE) - -# -------- enable acceptor libwrap (TCP wrappers) support? ------- -AC_MSG_CHECKING([if libwrap support is requested]) -AC_ARG_ENABLE([libwrap], - AC_HELP_STRING([--enable-libwrap], [use TCP wrappers]), - [case "${enableval}" in - yes) enable_libwrap=yes ;; - no) enable_libwrap=no ;; - *) AC_MSG_ERROR(bad value ${enableval} for --enable-libwrap) ;; - esac],[enable_libwrap=no]) -AC_MSG_RESULT([$enable_libwrap]) -if test x$enable_libwrap = xyes ; then - LIBWRAP="-lwrap" - AC_DEFINE(HAVE_LIBWRAP, 1, [libwrap support is requested]) -else - LIBWRAP="" -fi -AC_SUBST(LIBWRAP) - -AC_SUBST(LIBS) diff --git a/lustre/portals/doc/.cvsignore b/lustre/portals/doc/.cvsignore deleted file mode 100644 index 827dca4..0000000 --- a/lustre/portals/doc/.cvsignore +++ /dev/null @@ -1,4 +0,0 @@ -Makefile -Makefile.in -*.eps -*.pdf diff --git a/lustre/portals/doc/Data-structures b/lustre/portals/doc/Data-structures deleted file mode 100644 index b5532b1..0000000 --- a/lustre/portals/doc/Data-structures +++ /dev/null @@ -1,65 +0,0 @@ -In this document I will try to 
draw the data structures and how they -interrelate in the Portals 3 reference implementation. It is probably -best shown with a drawing, so there may be an additional xfig or -Postscript figure. - - -MEMORY POOLS: ------------- - -First, a digression on memory allocation in the library. As mentioned -in the NAL Writer's Guide, the library does not link against any -standard C libraries and as such is unable to dynamically allocate -memory on its own. It requires that the NAL implement a method -for allocation that is appropriate for the protection domain in -which the library lives. This is only called when a network -interface is initialized to allocate the Portals object pools. - -These pools are preallocated blocks of objects that the library -can rapidly make active and manage with a minimum of overhead. -It also cuts down on overhead for setting up structures -since the NAL->malloc() callback does not need to be called -for each object. - -The objects are maintained on a per-object type singly linked free -list and contain a pointer to the next free object. This pointer -is NULL if the object is not on the free list and is non-zero -if it is on the list. The special sentinel value of 0xDEADBEEF -is used to mark the end of the free list since NULL could -indicate that the last object in the list is not free. - -When one of the lib_*_alloc() functions is called, the library -returns the head of the free list and advances the head pointer -to the next item on the list. The special case of 0xDEADBEEF is -checked and a NULL pointer is returned if there are no more -objects of this type available. The lib_*_free() functions -are even simpler -- check to ensure that the object is not already -free, set its next pointer to the current head and then set -the head to be this newly freed object. - -Since C does not have templates, I did the next best thing and wrote -the memory pool allocation code as a macro that expands based on the -type of the argument.
The mk_alloc(T) macro expands to -write the _lib_T_alloc() and lib_T_free() functions. -It requires that the object have a pointer of the type T named -"next_free". There are also functions that map _lib_T_alloc() -to lib_T_alloc() so that the library can add some extra -functionality to the T constructor. - - - -LINKED LISTS: ------------- - -Many of the active Portals objects are stored in doubly linked lists -when they are active. These are always implemented with the pointer -to the next object and a pointer to the next pointer of the -previous object. This avoids the "dummy head" object or -special cases for inserting at the beginning or end of the list. -The pointer manipulations are a little hairy at times, but -I hope that they are understandable. - -The actual linked list code is implemented as macros in , -although the object has to know about - - diff --git a/lustre/portals/doc/Makefile.am b/lustre/portals/doc/Makefile.am deleted file mode 100644 index b7f6252..0000000 --- a/lustre/portals/doc/Makefile.am +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -LYX2PDF = lyx --export pdf -LYX2TXT = lyx --export text -LYX2HTML = lyx --export html -SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps - -if DOC - DOCS = portals3.pdf -else - DOCS = -endif - -IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps -LYXFILES= portals3.lyx - -MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(GENERATED) -GENERATED = -EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) - -all: $(DOCS) - -# update date and version in document -date := $(shell date +%x) -tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') -addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g' - -# Regenerate when the $(VERSION) or $Name: $ changes. 
-.INTERMEDIATE: $(GENERATED) -$(GENERATED) : %.lyx: %.lin Makefile - $(addversion) $< > $@ - -.lyx.pdf: - @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" - -.lyx.txt: - @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" -.lyx.html: - @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" -.fig.eps: - -fig2dev -L eps $< > $@ - -portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx - -syncweb: portals3.pdf -# cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf -# ( cd /usr/src/www ; make lustre ; make synclustre ) - diff --git a/lustre/portals/doc/Message-life-cycle b/lustre/portals/doc/Message-life-cycle deleted file mode 100644 index e8cc7e2..0000000 --- a/lustre/portals/doc/Message-life-cycle +++ /dev/null @@ -1,118 +0,0 @@ -This documents the life cycle of message as it arrives and is handled by -a basic async, packetized NAL. There are four types of messages that have -slightly different life cycles, so they are addressed independently. - - -Put request ------------ - -1. NAL notices that there is a incoming message header on the network -and reads an ptl_hdr_t in from the wire. - -2. It may store additional NAL specific data that provides context -for this event in a void* that it will interpret in some fashion -later. - -3. The NAL calls lib_parse() with a pointer to the header and its -private data structure. - -4. The library decodes the header and may build a message state -object that describes the event to be written and the ACK to be -sent, if any. It then calls nal->recv() with the private data -that the NAL passed in, a pointer to the message state object -and a translated user address. - - The NAL will have been given a chance to pretranslate - all user addresses when the buffers are created. This - process is described in the NAL-HOWTO. - -5. 
The NAL should restore what ever context it required from the -private data pointer, begin receiving the bytes and possibly store -some extra state of its own. It should return at this point. - - - -Get request ------------ - -1. As with a Put, the NAL notices the incoming message header and -passes it to lib_parse(). - -2. The library decodes the header and calls nal->recv() with a -zero byte length, offset and destination to instruct it to clean -up the wire after reading the header. The private data will -be passed in as well, allowing the NAL to retrieve any state -or context that it requires. - -3. The library may build a message state object to possibly -write an event log or invalidate a memory region. - -4. The library will build a ptl_msg_t header that specifies the -Portals protocol information for delivery at the remote end. - -5. The library calls nal->send() with the pre-built header, -the optional message state object, the four part address -component, a translated user pointer + offset, and some -other things. - -6. The NAL is to put the header on the wire or copy it at -this point (since it off the stack). It should store some -amount of state about its current position in the message and -the destination address. - -7. And then return to the library. - - -Reply request -------------- - -1. Starting at "The library decodes the header..." - -2. The library decodes the header and calls nal->recv() -to bring in the rest of the message. Flow continues in -exactly the same fashion as with all other receives. - - -Ack request ------------ - -1. The library decodes the header, builds the appropriate data -structures for the event in a message state object and calls nal->recv() -with a zero byte length, etc. - - -Packet arrival --------------- - -1. 
The NAL should notice the arrival of a packet, retrieve whatever -state it needs from the message ID or other NAL specific header data -and place the data bytes directly into the user address that was -given to nal->recv(). - - How this happens is outside the scope of the Portals library - and solely determined by the NAL... - -2. If this is the last packet in a message, the NAL should retrieve -the lib_msg_t *cookie that it was given in the call to nal->recv() -and pass it to lib_finalize(). lib_finalize() may call nal->send() -to send an ACK, nal->write() to record an entry in the event log, -nal->invalidate() to unregister a region of memory or do nothing at all. - -3. It should then clean up any remaining NAL specific state about -the message and go back into the main loop. - - -Outgoing packets ----------------- - -1. When the NAL has pending output, it should put the packets on -the wire wrapped with whatever implementation specified wrappers. - -2. Once it has output all the packets of a message it should -call lib_finalize() with the message state object that was -handed to nal->send(). This will allow the library to clean -up its state regarding the message and write any pending event -entries. - - - diff --git a/lustre/portals/doc/NAL-HOWTO b/lustre/portals/doc/NAL-HOWTO deleted file mode 100644 index ea38aed..0000000 --- a/lustre/portals/doc/NAL-HOWTO +++ /dev/null @@ -1,293 +0,0 @@ -This document is a first attempt at describing how to write a NAL -for the Portals 3 library. It also defines the library architecture -and the abstraction of protection domains. - - -First, an overview of the architecture: - - Application - -----|----+-------- - | - API === NAL (User space) - | ----------+---|----- - | - LIB === NAL (Library space) - | ----------+---|----- - - Physical wire (NIC space) - - -Application - API -API-side NAL ------------- -LIB-side NAL - LIB -LIB-side NAL - wire - -Communication is through the indicated paths via well defined -interfaces.
The API and LIB portions are written to be portable -across platforms and do not depend on the network interface. - -Communication between the application and the API code is -defined in the Portals 3 API specification. This is the -user-visible portion of the interface and should be the most -stable. - - - -API-side NAL: ------------- - -The user space NAL needs to implement only a few functions -that are stored in a nal_t data structure and called by the -API-side library: - - int forward( nal_t *nal, - int index, - void *args, - size_t arg_len, - void *ret, - size_t ret_len - ); - -Most of the data structures in the portals library are held in -the LIB section of the code, so it is necessary to forward API -calls across the protection domain to the library. This is -handled by the NAL's forward method. Once the argument and return -blocks are on the remote side the NAL should call lib_dispatch() -to invoke the appropriate API function. - - int validate( nal_t *nal, - void *base, - size_t extent, - void **trans_base, - void **trans_data - ); - -The validate method provides a means for the NAL to prevalidate -and possibly pretranslate user addresses into a form suitable -for fast use by the network card or kernel module. The trans_base -pointer will be used by the library every time it needs to -refer to the block of memory. The trans_data result is a -cookie that will be handed to the NAL along with the trans_base. - -The library never performs calculations on the trans_base value; -it only computes offsets that are then handed to the NAL. - - - int shutdown( nal_t *nal, int interface ); - -Brings down the network interface. The remote NAL side should -call lib_fini() to bring down the library side of the network. - - void yield( nal_t *nal ); - -This allows the user application to gracefully give up the processor -while busy waiting.
Performance critical applications may not -want to take the time to call this function, so it should be an -option to the PtlEQWait call. Right now it is not implemented as such. - -Lastly, the NAL must implement a function named PTL_IFACE_*, where -* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR. -This initialization function is to set up communication with the -library-side NAL, which should call lib_init() to bring up the -network interface. - - - -LIB-side NAL: ------------- - -On the library-side, the NAL has much more responsibility. It -is responsible for calling lib_dispatch() on behalf of the user, -it is also responsible for bringing packets off the wire and -pushing bits out. As on the user side, the methods are stored -in a nal_cb_t structure that is defined on a per network -interface basis. - -The calls to lib_dispatch() need to be examined. The prototype: - - void lib_dispatch( - nal_cb_t *nal, - void *private, - int index, - void *arg_block, - void *ret_block - ); - -has two complications. The private field is a NAL-specific -value that will be passed to any callbacks produced as a result -of this API call. Kernel module implementations may use this -for task structures, or perhaps network card data. It is ignored -by the library. - -Secondly, the arg_block and ret_block must be in the same protection -domain as the library. The NAL's two halves must communicate the -sizes and perform the copies. After the call, the buffer pointed -to by ret_block will be filled in and should be copied back to -the user space. How this is to be done is NAL specific. - - int lib_parse( - nal_cb_t *nal, - ptl_hdr_t *hdr, - void *private - ); - -This is the only other entry point into the library from the NAL. -When the NAL detects an incoming message on the wire it should read -sizeof(ptl_hdr_t) bytes and pass a pointer to the header to -lib_parse(). 
It may set private to be anything that it needs to -tie the incoming message to callbacks that are made as a result -of this event. - -The method calls are: - - int (*send)( - nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int nid, - int pid, - int gid, - int rid, - user_ptr trans_base, - user_ptr trans_data, - size_t offset, - size_t len - ); - -This is a tricky function -- it must support async output -of messages as well as properly synchronized event log writing. -The private field is the same that was passed into lib_dispatch() -or lib_parse() and may be used to tie this call to the event -that initiated the entry to the library. - -The cookie is a pointer to a library private value that must -be passed to lib_finalize() once the message has been completely -sent. It should not be examined by the NAL for any meaning. - -The four ID fields are passed in, although some implementations -may not use all of them. - -The single base pointer has been replaced with the translated -address that the API NAL generated in the api_nal->validate() -call. The trans_data is unchanged and the offset is in bytes. - - - int (*recv)( - nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - user_ptr trans_base, - user_ptr trans_data, - size_t offset, - size_t mlen, - size_t rlen - ); - -This callback will only be called in response to lib_parse(). -The cookie, trans_base and trans_data are as discussed in send(). -The NAL should read mlen bytes from the wire, deposit them into -trans_base + offset and then discard (rlen - mlen) bytes. -Once the entire message has been received the NAL should call -lib_finalize() with the lib_msg_t *cookie. - -The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0 -are used to indicate that the NAL should clean up the wire. This could -be implemented as a blocking call, although having it return as quickly -as possible is desirable.
- - int (*write)( - nal_cb_t *nal, - void *private, - user_ptr trans_addr, - user_ptr trans_data, - size_t offset, - - void *src_addr, - size_t len - ); - -This is essentially a cross-protection domain memcpy(). The user address -has been pretranslated by the api_nal->translate() call. - - void *(*malloc)( - nal_cb_t *nal, - size_t len - ); - - void (*free)( - nal_cb_t *nal, - void *buf - ); - -Since the NAL may be in a non-standard hosted environment it can -not call malloc(). This allows the library side NAL to implement -the system specific malloc(). In the current reference implementation -the libary only calls nal->malloc() when the network interface is -initialized and then calls free when it is brought down. The library -maintains its own pool of objects for allocation so only one call to -malloc is made per object type. - - void (*invalidate)( - nal_cb_t *nal, - user_ptr trans_base, - user_ptr trans_data, - size_t extent - ); - -User addresses are validated/translated at the user-level API NAL -method, which is likely to push them to this level. Meanwhile, -the library NAL will be notified when the library no longer -needs the buffer. Overlapped buffers are not detected by the -library, so the NAL should ref count each page involved. - -Unfortunately we have a few bugs when the invalidate method is -called. It is still in progress... - - void (*printf)( - nal_cb_t *nal, - const char *fmt, - ... - ); - -As with malloc(), the library does not have any way to do printf -or printk. It is not necessary for the NAL to implement the this -call, although it will make debugging difficult. - - void (*cli)( - nal_cb_t *nal, - unsigned long *flags - ); - - void (*sti)( - nal_cb_t *nal, - unsigned long *flags - ); - -These are used by the library to mark critical sections. 
- - int (*gidrid2nidpid)( - nal_cb_t *nal, - ptl_id_t gid, - ptl_id_t rid, - ptl_id_t *nid, - ptl_id_t *pid - ); - - - int (*nidpid2gidrid)( - nal_cb_t *nal, - ptl_id_t nid, - ptl_id_t pid, - ptl_id_t *gid, - ptl_id_t *rid - ); - -Rolf added these. I haven't looked at how they have to work yet. diff --git a/lustre/portals/doc/file.fig b/lustre/portals/doc/file.fig deleted file mode 100644 index 914c294..0000000 --- a/lustre/portals/doc/file.fig +++ /dev/null @@ -1,111 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -6 1200 750 1650 1050 -2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 - 1650 1050 1650 750 1200 750 1200 1050 1650 1050 -4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001 --6 -6 1200 2325 1650 2625 -2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 - 1650 2625 1650 2325 1200 2325 1200 2625 1650 2625 -4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001 --6 -6 1200 1800 1650 2100 -2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 - 1650 2100 1650 1800 1200 1800 1200 2100 1650 2100 -4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001 --6 -6 1200 1275 1650 1575 -2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 - 1650 1575 1650 1275 1200 1275 1200 1575 1650 1575 -4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001 --6 -6 450 750 900 1200 -5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050 -1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 450 825 450 1050 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 1050 900 825 --6 -6 450 2325 900 2775 -5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625 -1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 450 2400 450 2625 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 2625 900 2400 --6 -6 450 1800 900 2250 -5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100 -1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800 
-2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 450 1875 450 2100 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 2100 900 1875 --6 -6 450 1275 900 1725 -5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575 -1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 450 1350 450 1575 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 1575 900 1350 --6 -6 2250 750 3450 2625 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 - 2550 1200 3150 1200 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 - 2550 1500 3150 1500 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 - 2550 1800 3150 1800 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 - 2550 2100 3150 2100 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2550 975 3150 975 3150 2625 2550 2625 2550 975 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 - 2550 2400 3150 2400 -4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001 --6 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 2400 2550 1350 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 1875 2550 1050 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 1425 2550 1950 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 900 2550 1650 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 900 1200 900 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 1425 1200 1425 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 1950 1200 1950 -2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 - 900 2475 1200 2475 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 2025 2550 2250 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1650 2550 2550 2475 -2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 - 1875 2850 1875 600 225 600 225 2850 1875 2850 -4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001 diff 
--git a/lustre/portals/doc/flow_new.fig b/lustre/portals/doc/flow_new.fig deleted file mode 100644 index d828dea..0000000 --- a/lustre/portals/doc/flow_new.fig +++ /dev/null @@ -1,213 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -6 525 2175 1575 2925 -6 675 2287 1425 2812 -4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001 -4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001 -4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001 --6 -2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 - 1575 2550 1050 2175 525 2550 1050 2925 1575 2550 --6 -6 3450 1275 4350 1725 -6 3600 1312 4200 1687 -4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001 -4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 3450 1275 4350 1275 4350 1725 3450 1725 3450 1275 --6 -6 4650 1275 5550 1725 -6 4725 1312 5475 1687 -4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001 -4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 4650 1275 5550 1275 5550 1725 4650 1725 4650 1275 --6 -6 1350 525 2250 975 -6 1350 562 2250 937 -4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001 -4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1350 525 2250 525 2250 975 1350 975 1350 525 --6 -6 525 1125 1575 1875 -2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 - 1575 1500 1050 1125 525 1500 1050 1875 1575 1500 -4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001 --6 -6 2340 1237 2940 1687 -6 2340 1237 2940 1687 -4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001 -4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001 -4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001 --6 --6 -6 525 3225 1575 3975 -6 675 3375 1425 3750 -4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001 -4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001 --6 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 - 525 3600 1050 3225 1575 3600 1050 3975 525 3600 
--6 -6 3300 3375 4350 3825 -6 3300 3412 4350 3787 -4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001 -4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 3300 3375 4350 3375 4350 3825 3300 3825 3300 3375 --6 -6 1950 3225 3000 3975 -6 2250 3450 2700 3750 -4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001 -4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001 --6 -2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 - 3000 3600 2475 3225 1950 3600 2475 3975 3000 3600 --6 -6 3150 4500 4200 4950 -6 3150 4537 4200 4912 -4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001 -4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 3150 4500 4200 4500 4200 4950 3150 4950 3150 4500 --6 -6 600 4500 1500 4950 -6 675 4537 1425 4912 -4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001 -4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 600 4500 1500 4500 1500 4950 600 4950 600 4500 --6 -6 4650 4350 5700 5100 -6 4950 4537 5400 4912 -6 4950 4537 5400 4912 -4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001 -4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001 --6 --6 -2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 - 5700 4725 5175 4350 4650 4725 5175 5100 5700 4725 --6 -6 6000 4500 6900 4950 -6 6225 4575 6675 4875 -4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001 -4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001 --6 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 6000 4500 6900 4500 6900 4950 6000 4950 6000 4500 --6 -6 1800 4350 2850 5100 -6 2100 4575 2550 4875 -4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001 -4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001 --6 -2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 - 2850 4725 2325 4350 1800 4725 2325 5100 2850 4725 --6 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1050 1875 1050 2175 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 
1575 1500 2100 1500 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1050 450 1050 1125 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1350 750 1050 750 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1050 2925 1050 3225 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3150 1500 3450 1500 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 4350 1500 4650 1500 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 - 2100 1500 2625 1125 3150 1500 2625 1875 2100 1500 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1575 3600 1950 3600 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1050 3975 1050 4500 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3000 3600 3300 3600 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1500 4725 1800 4725 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 5700 4725 6000 4725 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 2850 4725 3150 4725 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 4200 4725 4650 4725 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 6900 4725 7950 4725 -3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 - 0 0 1.00 60.00 120.00 - 1575 2550 1650 2550 1800 2550 1800 2400 1800 1500 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 - 0 0 1.00 60.00 120.00 - 2250 750 2475 750 2625 750 2625 900 2625 1125 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 - 0 0 1.00 60.00 120.00 - 7500 4725 7500 1650 7500 1500 7350 1500 5550 1500 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 - 0 0 1.00 60.00 120.00 - 2475 3225 2475 2400 2475 2250 2325 2250 1800 2250 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 - 0 0 1.00 60.00 120.00 - 3825 3375 3825 2175 3825 2025 3675 2025 1800 2025 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 - 0 0 1.00 
60.00 120.00 - 2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125 - 4425 4275 4425 4725 - 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 - 0 0 1.00 60.00 120.00 - 5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125 - 7275 4275 7275 4725 - 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 -4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001 -4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001 -4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001 -4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001 -4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001 -4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001 -4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001 -4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001 -4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001 -4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001 -4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001 -4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001 -4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001 -4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001 -4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001 -4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001 diff --git a/lustre/portals/doc/get.fig b/lustre/portals/doc/get.fig deleted file mode 100644 index 28db949..0000000 --- a/lustre/portals/doc/get.fig +++ /dev/null @@ -1,33 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -6 2775 900 3525 1200 -4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001 -4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001 --6 -6 1350 1725 2175 2025 -4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001 -4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001 --6 -2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 900 525 2700 750 -2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 2700 825 2700 1275 -2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2 - 0 0 1.00 60.00 120.00 - 2700 1350 900 1950 -2 2 0 1 0 7 100 0 -1 4.000 0 
0 7 0 0 5 - 2400 300 3600 300 3600 2250 2400 2250 2400 300 -2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 - 0 300 1200 300 1200 2250 0 2250 0 300 -4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001 -4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 -4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 diff --git a/lustre/portals/doc/ieee.bst b/lustre/portals/doc/ieee.bst deleted file mode 100644 index 4df7c50..0000000 --- a/lustre/portals/doc/ieee.bst +++ /dev/null @@ -1,1112 +0,0 @@ -% --------------------------------------------------------------- -% -% by Paolo.Ienne@di.epfl.ch -% -% --------------------------------------------------------------- -% -% no guarantee is given that the format corresponds perfectly to -% IEEE 8.5" x 11" Proceedings, but most features should be ok. -% -% --------------------------------------------------------------- -% -% `ieee' from BibTeX standard bibliography style `abbrv' -% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. -% Copyright (C) 1985, all rights reserved. -% Copying of this file is authorized only if either -% (1) you make absolutely no changes to your copy, including name, or -% (2) if you do make changes, you name it something other than -% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. -% This restriction helps ensure that all standard styles are identical. -% The file btxbst.doc has the documentation for this style. 
- -ENTRY - { address - author - booktitle - chapter - edition - editor - howpublished - institution - journal - key - month - note - number - organization - pages - publisher - school - series - title - type - volume - year - } - {} - { label } - -INTEGERS { output.state before.all mid.sentence after.sentence after.block } - -FUNCTION {init.state.consts} -{ #0 'before.all := - #1 'mid.sentence := - #2 'after.sentence := - #3 'after.block := -} - -STRINGS { s t } - -FUNCTION {output.nonnull} -{ 's := - output.state mid.sentence = - { ", " * write$ } - { output.state after.block = - { add.period$ write$ - newline$ - "\newblock " write$ - } - { output.state before.all = - 'write$ - { add.period$ " " * write$ } - if$ - } - if$ - mid.sentence 'output.state := - } - if$ - s -} - -FUNCTION {output} -{ duplicate$ empty$ - 'pop$ - 'output.nonnull - if$ -} - -FUNCTION {output.check} -{ 't := - duplicate$ empty$ - { pop$ "empty " t * " in " * cite$ * warning$ } - 'output.nonnull - if$ -} - -FUNCTION {output.bibitem} -{ newline$ - "\bibitem{" write$ - cite$ write$ - "}" write$ - newline$ - "" - before.all 'output.state := -} - -FUNCTION {fin.entry} -{ add.period$ - write$ - newline$ -} - -FUNCTION {new.block} -{ output.state before.all = - 'skip$ - { after.block 'output.state := } - if$ -} - -FUNCTION {new.sentence} -{ output.state after.block = - 'skip$ - { output.state before.all = - 'skip$ - { after.sentence 'output.state := } - if$ - } - if$ -} - -FUNCTION {not} -{ { #0 } - { #1 } - if$ -} - -FUNCTION {and} -{ 'skip$ - { pop$ #0 } - if$ -} - -FUNCTION {or} -{ { pop$ #1 } - 'skip$ - if$ -} - -FUNCTION {new.block.checka} -{ empty$ - 'skip$ - 'new.block - if$ -} - -FUNCTION {new.block.checkb} -{ empty$ - swap$ empty$ - and - 'skip$ - 'new.block - if$ -} - -FUNCTION {new.sentence.checka} -{ empty$ - 'skip$ - 'new.sentence - if$ -} - -FUNCTION {new.sentence.checkb} -{ empty$ - swap$ empty$ - and - 'skip$ - 'new.sentence - if$ -} - -FUNCTION {field.or.null} -{ duplicate$ empty$ 
- { pop$ "" } - 'skip$ - if$ -} - -FUNCTION {emphasize} -{ duplicate$ empty$ - { pop$ "" } - { "{\em " swap$ * "}" * } - if$ -} - -INTEGERS { nameptr namesleft numnames } - -FUNCTION {format.names} -{ 's := - #1 'nameptr := - s num.names$ 'numnames := - numnames 'namesleft := - { namesleft #0 > } - { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := - nameptr #1 > - { namesleft #1 > - { ", " * t * } - { numnames #2 > - { "," * } - 'skip$ - if$ - t "others" = - { " et~al." * } - { " and " * t * } - if$ - } - if$ - } - 't - if$ - nameptr #1 + 'nameptr := - namesleft #1 - 'namesleft := - } - while$ -} - -FUNCTION {format.authors} -{ author empty$ - { "" } - { author format.names } - if$ -} - -FUNCTION {format.editors} -{ editor empty$ - { "" } - { editor format.names - editor num.names$ #1 > - { ", editors" * } - { ", editor" * } - if$ - } - if$ -} - -FUNCTION {format.title} -{ title empty$ - { "" } - { title "t" change.case$ } - if$ -} - -FUNCTION {n.dashify} -{ 't := - "" - { t empty$ not } - { t #1 #1 substring$ "-" = - { t #1 #2 substring$ "--" = not - { "--" * - t #2 global.max$ substring$ 't := - } - { { t #1 #1 substring$ "-" = } - { "-" * - t #2 global.max$ substring$ 't := - } - while$ - } - if$ - } - { t #1 #1 substring$ * - t #2 global.max$ substring$ 't := - } - if$ - } - while$ -} - -FUNCTION {format.date} -{ year empty$ - { month empty$ - { "" } - { "there's a month but no year in " cite$ * warning$ - month - } - if$ - } - { month empty$ - 'year - { month " " * year * } - if$ - } - if$ -} - -FUNCTION {format.btitle} -{ title emphasize -} - -FUNCTION {tie.or.space.connect} -{ duplicate$ text.length$ #3 < - { "~" } - { " " } - if$ - swap$ * * -} - -FUNCTION {either.or.check} -{ empty$ - 'pop$ - { "can't use both " swap$ * " fields in " * cite$ * warning$ } - if$ -} - -FUNCTION {format.bvolume} -{ volume empty$ - { "" } - { "volume" volume tie.or.space.connect - series empty$ - 'skip$ - { " of " * series emphasize * } - if$ - "volume and number" number 
either.or.check - } - if$ -} - -FUNCTION {format.number.series} -{ volume empty$ - { number empty$ - { series field.or.null } - { output.state mid.sentence = - { "number" } - { "Number" } - if$ - number tie.or.space.connect - series empty$ - { "there's a number but no series in " cite$ * warning$ } - { " in " * series * } - if$ - } - if$ - } - { "" } - if$ -} - -FUNCTION {format.edition} -{ edition empty$ - { "" } - { output.state mid.sentence = - { edition "l" change.case$ " edition" * } - { edition "t" change.case$ " edition" * } - if$ - } - if$ -} - -INTEGERS { multiresult } - -FUNCTION {multi.page.check} -{ 't := - #0 'multiresult := - { multiresult not - t empty$ not - and - } - { t #1 #1 substring$ - duplicate$ "-" = - swap$ duplicate$ "," = - swap$ "+" = - or or - { #1 'multiresult := } - { t #2 global.max$ substring$ 't := } - if$ - } - while$ - multiresult -} - -FUNCTION {format.pages} -{ pages empty$ - { "" } - { pages multi.page.check - { "pages" pages n.dashify tie.or.space.connect } - { "page" pages tie.or.space.connect } - if$ - } - if$ -} - -FUNCTION {format.vol.num.pages} -{ volume field.or.null - number empty$ - 'skip$ - { "(" number * ")" * * - volume empty$ - { "there's a number but no volume in " cite$ * warning$ } - 'skip$ - if$ - } - if$ - pages empty$ - 'skip$ - { duplicate$ empty$ - { pop$ format.pages } - { ":" * pages n.dashify * } - if$ - } - if$ -} - -FUNCTION {format.chapter.pages} -{ chapter empty$ - 'format.pages - { type empty$ - { "chapter" } - { type "l" change.case$ } - if$ - chapter tie.or.space.connect - pages empty$ - 'skip$ - { ", " * format.pages * } - if$ - } - if$ -} - -FUNCTION {format.in.ed.booktitle} -{ booktitle empty$ - { "" } - { editor empty$ - { "In " booktitle emphasize * } - { "In " format.editors * ", " * booktitle emphasize * } - if$ - } - if$ -} - -FUNCTION {empty.misc.check} -{ author empty$ title empty$ howpublished empty$ - month empty$ year empty$ note empty$ - and and and and and - key empty$ not and - { 
"all relevant fields are empty in " cite$ * warning$ } - 'skip$ - if$ -} - -FUNCTION {format.thesis.type} -{ type empty$ - 'skip$ - { pop$ - type "t" change.case$ - } - if$ -} - -FUNCTION {format.tr.number} -{ type empty$ - { "Technical Report" } - 'type - if$ - number empty$ - { "t" change.case$ } - { number tie.or.space.connect } - if$ -} - -FUNCTION {format.article.crossref} -{ key empty$ - { journal empty$ - { "need key or journal for " cite$ * " to crossref " * crossref * - warning$ - "" - } - { "In {\em " journal * "\/}" * } - if$ - } - { "In " key * } - if$ - " \cite{" * crossref * "}" * -} - -FUNCTION {format.crossref.editor} -{ editor #1 "{vv~}{ll}" format.name$ - editor num.names$ duplicate$ - #2 > - { pop$ " et~al." * } - { #2 < - 'skip$ - { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = - { " et~al." * } - { " and " * editor #2 "{vv~}{ll}" format.name$ * } - if$ - } - if$ - } - if$ -} - -FUNCTION {format.book.crossref} -{ volume empty$ - { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ - "In " - } - { "Volume" volume tie.or.space.connect - " of " * - } - if$ - editor empty$ - editor field.or.null author field.or.null = - or - { key empty$ - { series empty$ - { "need editor, key, or series for " cite$ * " to crossref " * - crossref * warning$ - "" * - } - { "{\em " * series * "\/}" * } - if$ - } - { key * } - if$ - } - { format.crossref.editor * } - if$ - " \cite{" * crossref * "}" * -} - -FUNCTION {format.incoll.inproc.crossref} -{ editor empty$ - editor field.or.null author field.or.null = - or - { key empty$ - { booktitle empty$ - { "need editor, key, or booktitle for " cite$ * " to crossref " * - crossref * warning$ - "" - } - { "In {\em " booktitle * "\/}" * } - if$ - } - { "In " key * } - if$ - } - { "In " format.crossref.editor * } - if$ - " \cite{" * crossref * "}" * -} - -FUNCTION {article} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - crossref 
missing$ - { journal emphasize "journal" output.check - format.vol.num.pages output - format.date "year" output.check - } - { format.article.crossref output.nonnull - format.pages output - } - if$ - new.block - note output - fin.entry -} - -FUNCTION {book} -{ output.bibitem - author empty$ - { format.editors "author and editor" output.check } - { format.authors output.nonnull - crossref missing$ - { "author and editor" editor either.or.check } - 'skip$ - if$ - } - if$ - new.block - format.btitle "title" output.check - crossref missing$ - { format.bvolume output - new.block - format.number.series output - new.sentence - publisher "publisher" output.check - address output - } - { new.block - format.book.crossref output.nonnull - } - if$ - format.edition output - format.date "year" output.check - new.block - note output - fin.entry -} - -FUNCTION {booklet} -{ output.bibitem - format.authors output - new.block - format.title "title" output.check - howpublished address new.block.checkb - howpublished output - address output - format.date output - new.block - note output - fin.entry -} - -FUNCTION {inbook} -{ output.bibitem - author empty$ - { format.editors "author and editor" output.check } - { format.authors output.nonnull - crossref missing$ - { "author and editor" editor either.or.check } - 'skip$ - if$ - } - if$ - new.block - format.btitle "title" output.check - crossref missing$ - { format.bvolume output - format.chapter.pages "chapter and pages" output.check - new.block - format.number.series output - new.sentence - publisher "publisher" output.check - address output - } - { format.chapter.pages "chapter and pages" output.check - new.block - format.book.crossref output.nonnull - } - if$ - format.edition output - format.date "year" output.check - new.block - note output - fin.entry -} - -FUNCTION {incollection} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - crossref missing$ - { 
format.in.ed.booktitle "booktitle" output.check - format.bvolume output - format.number.series output - format.chapter.pages output - new.sentence - publisher "publisher" output.check - address output - format.edition output - format.date "year" output.check - } - { format.incoll.inproc.crossref output.nonnull - format.chapter.pages output - } - if$ - new.block - note output - fin.entry -} - -FUNCTION {inproceedings} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - crossref missing$ - { format.in.ed.booktitle "booktitle" output.check - format.bvolume output - format.number.series output - format.pages output - address empty$ - { organization publisher new.sentence.checkb - organization output - publisher output - format.date "year" output.check - } - { address output.nonnull - format.date "year" output.check - new.sentence - organization output - publisher output - } - if$ - } - { format.incoll.inproc.crossref output.nonnull - format.pages output - } - if$ - new.block - note output - fin.entry -} - -FUNCTION {conference} { inproceedings } - -FUNCTION {manual} -{ output.bibitem - author empty$ - { organization empty$ - 'skip$ - { organization output.nonnull - address output - } - if$ - } - { format.authors output.nonnull } - if$ - new.block - format.btitle "title" output.check - author empty$ - { organization empty$ - { address new.block.checka - address output - } - 'skip$ - if$ - } - { organization address new.block.checkb - organization output - address output - } - if$ - format.edition output - format.date output - new.block - note output - fin.entry -} - -FUNCTION {mastersthesis} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - "Master's thesis" format.thesis.type output.nonnull - school "school" output.check - address output - format.date "year" output.check - new.block - note output - fin.entry -} - -FUNCTION {misc} -{ 
output.bibitem - format.authors output - title howpublished new.block.checkb - format.title output - howpublished new.block.checka - howpublished output - format.date output - new.block - note output - fin.entry - empty.misc.check -} - -FUNCTION {phdthesis} -{ output.bibitem - format.authors "author" output.check - new.block - format.btitle "title" output.check - new.block - "PhD thesis" format.thesis.type output.nonnull - school "school" output.check - address output - format.date "year" output.check - new.block - note output - fin.entry -} - -FUNCTION {proceedings} -{ output.bibitem - editor empty$ - { organization output } - { format.editors output.nonnull } - if$ - new.block - format.btitle "title" output.check - format.bvolume output - format.number.series output - address empty$ - { editor empty$ - { publisher new.sentence.checka } - { organization publisher new.sentence.checkb - organization output - } - if$ - publisher output - format.date "year" output.check - } - { address output.nonnull - format.date "year" output.check - new.sentence - editor empty$ - 'skip$ - { organization output } - if$ - publisher output - } - if$ - new.block - note output - fin.entry -} - -FUNCTION {techreport} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - format.tr.number output.nonnull - institution "institution" output.check - address output - format.date "year" output.check - new.block - note output - fin.entry -} - -FUNCTION {unpublished} -{ output.bibitem - format.authors "author" output.check - new.block - format.title "title" output.check - new.block - note "note" output.check - format.date output - fin.entry -} - -FUNCTION {default.type} { misc } - -MACRO {jan} {"Jan."} - -MACRO {feb} {"Feb."} - -MACRO {mar} {"Mar."} - -MACRO {apr} {"Apr."} - -MACRO {may} {"May"} - -MACRO {jun} {"June"} - -MACRO {jul} {"July"} - -MACRO {aug} {"Aug."} - -MACRO {sep} {"Sept."} - -MACRO {oct} {"Oct."} - -MACRO {nov} 
{"Nov."} - -MACRO {dec} {"Dec."} - -MACRO {acmcs} {"ACM Comput. Surv."} - -MACRO {acta} {"Acta Inf."} - -MACRO {cacm} {"Commun. ACM"} - -MACRO {ibmjrd} {"IBM J. Res. Dev."} - -MACRO {ibmsj} {"IBM Syst.~J."} - -MACRO {ieeese} {"IEEE Trans. Softw. Eng."} - -MACRO {ieeetc} {"IEEE Trans. Comput."} - -MACRO {ieeetcad} - {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} - -MACRO {ipl} {"Inf. Process. Lett."} - -MACRO {jacm} {"J.~ACM"} - -MACRO {jcss} {"J.~Comput. Syst. Sci."} - -MACRO {scp} {"Sci. Comput. Programming"} - -MACRO {sicomp} {"SIAM J. Comput."} - -MACRO {tocs} {"ACM Trans. Comput. Syst."} - -MACRO {tods} {"ACM Trans. Database Syst."} - -MACRO {tog} {"ACM Trans. Gr."} - -MACRO {toms} {"ACM Trans. Math. Softw."} - -MACRO {toois} {"ACM Trans. Office Inf. Syst."} - -MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} - -MACRO {tcs} {"Theoretical Comput. Sci."} - -READ - -FUNCTION {sortify} -{ purify$ - "l" change.case$ -} - -INTEGERS { len } - -FUNCTION {chop.word} -{ 's := - 'len := - s #1 len substring$ = - { s len #1 + global.max$ substring$ } - 's - if$ -} - -FUNCTION {sort.format.names} -{ 's := - #1 'nameptr := - "" - s num.names$ 'numnames := - numnames 'namesleft := - { namesleft #0 > } - { nameptr #1 > - { " " * } - 'skip$ - if$ - s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := - nameptr numnames = t "others" = and - { "et al" * } - { t sortify * } - if$ - nameptr #1 + 'nameptr := - namesleft #1 - 'namesleft := - } - while$ -} - -FUNCTION {sort.format.title} -{ 't := - "A " #2 - "An " #3 - "The " #4 t chop.word - chop.word - chop.word - sortify - #1 global.max$ substring$ -} - -FUNCTION {author.sort} -{ author empty$ - { key empty$ - { "to sort, need author or key in " cite$ * warning$ - "" - } - { key sortify } - if$ - } - { author sort.format.names } - if$ -} - -FUNCTION {author.editor.sort} -{ author empty$ - { editor empty$ - { key empty$ - { "to sort, need author, editor, or key in " cite$ * warning$ - "" - } - { key sortify } - if$ 
- } - { editor sort.format.names } - if$ - } - { author sort.format.names } - if$ -} - -FUNCTION {author.organization.sort} -{ author empty$ - { organization empty$ - { key empty$ - { "to sort, need author, organization, or key in " cite$ * warning$ - "" - } - { key sortify } - if$ - } - { "The " #4 organization chop.word sortify } - if$ - } - { author sort.format.names } - if$ -} - -FUNCTION {editor.organization.sort} -{ editor empty$ - { organization empty$ - { key empty$ - { "to sort, need editor, organization, or key in " cite$ * warning$ - "" - } - { key sortify } - if$ - } - { "The " #4 organization chop.word sortify } - if$ - } - { editor sort.format.names } - if$ -} - -FUNCTION {presort} -{ type$ "book" = - type$ "inbook" = - or - 'author.editor.sort - { type$ "proceedings" = - 'editor.organization.sort - { type$ "manual" = - 'author.organization.sort - 'author.sort - if$ - } - if$ - } - if$ - " " - * - year field.or.null sortify - * - " " - * - title field.or.null - sort.format.title - * - #1 entry.max$ substring$ - 'sort.key$ := -} - -ITERATE {presort} - -SORT - -STRINGS { longest.label } - -INTEGERS { number.label longest.label.width } - -FUNCTION {initialize.longest.label} -{ "" 'longest.label := - #1 'number.label := - #0 'longest.label.width := -} - -FUNCTION {longest.label.pass} -{ number.label int.to.str$ 'label := - number.label #1 + 'number.label := - label width$ longest.label.width > - { label 'longest.label := - label width$ 'longest.label.width := - } - 'skip$ - if$ -} - -EXECUTE {initialize.longest.label} - -ITERATE {longest.label.pass} - -FUNCTION {begin.bib} -{ preamble$ empty$ - 'skip$ - { preamble$ write$ newline$ } - if$ - "\begin{thebibliography}{" longest.label * - "}\setlength{\itemsep}{-1ex}\small" * write$ newline$ -} - -EXECUTE {begin.bib} - -EXECUTE {init.state.consts} - -ITERATE {call.type$} - -FUNCTION {end.bib} -{ newline$ - "\end{thebibliography}" write$ newline$ -} - -EXECUTE {end.bib} - -% end of file ieee.bst -% 
--------------------------------------------------------------- diff --git a/lustre/portals/doc/mpi.fig b/lustre/portals/doc/mpi.fig deleted file mode 100644 index e1a91b5..0000000 --- a/lustre/portals/doc/mpi.fig +++ /dev/null @@ -1,117 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -6 150 1650 900 2025 -4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001 -4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001 --6 -6 150 150 900 525 -4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001 -4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001 --6 -6 2550 4125 3150 4725 -4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001 -4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001 -4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001 --6 -6 1050 1575 1950 1875 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1050 1575 1950 1575 1950 1875 1050 1875 1050 1575 -4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001 --6 -6 5400 1575 6300 2175 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 5400 1575 6300 1575 6300 2175 5400 2175 5400 1575 -4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001 --6 -6 5400 2400 6300 3000 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 5400 2400 6300 2400 6300 3000 5400 3000 5400 2400 -4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001 --6 -6 1050 2400 1950 2700 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1050 2400 1950 2400 1950 2700 1050 2700 1050 2400 -4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001 --6 -6 1050 825 1950 1125 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1050 825 1950 825 1950 1125 1050 1125 1050 825 -4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001 --6 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1500 1125 1500 1575 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3225 2025 4050 3375 -2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 - 150 675 6600 675 -2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 - 150 1350 6600 1350 -2 2 0 1 0 7 100 0 
-1 0.000 0 0 -1 0 0 5 - 2400 4125 3300 4125 3300 4725 2400 4725 2400 4125 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3225 4500 4050 3675 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3225 1725 5400 1725 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3225 2550 5400 2550 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3225 2850 4050 3450 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1500 1800 1500 2400 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2400 825 3300 825 3300 1275 2400 1275 2400 825 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1500 2625 1500 4125 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1050 4125 1950 4125 1950 4425 1050 4425 1050 4125 -2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1500 300 1500 825 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1875 975 2400 975 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1875 1725 2400 1725 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1875 2550 2400 2550 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 1875 4275 2400 4275 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2400 1575 3300 1575 3300 2175 2400 2175 2400 1575 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2400 2400 3300 2400 3300 3000 2400 3000 2400 2400 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 4050 3300 5250 3300 5250 3750 4050 3750 4050 3300 -4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001 -4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001 -4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001 -4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001 -4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001 -4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001 -4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001 -4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001 -4 1 0 50 0 0 10 0.0000 0 
150 840 2850 1875 n - short_len\001 -4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001 -4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001 -4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001 -4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001 -4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001 -4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001 diff --git a/lustre/portals/doc/portals.fig b/lustre/portals/doc/portals.fig deleted file mode 100644 index 9b1271b..0000000 --- a/lustre/portals/doc/portals.fig +++ /dev/null @@ -1,68 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1350 900 1650 900 1650 1200 1350 1200 1350 900 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 1800 1350 2100 1350 2100 1650 1800 1650 1800 1350 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2250 1800 2550 1800 2550 2100 2250 2100 2250 1800 -2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 - 4200 375 4200 2100 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 525 600 1125 600 1125 2100 525 2100 525 600 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 4425 1275 4875 1275 4875 1950 4425 1950 4425 1275 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 2550 1200 3150 1200 3150 1500 2550 1500 2550 1200 -2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3000 1425 4425 1425 -2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 - 3600 825 3750 825 3750 1125 3600 1125 3600 825 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 2025 1425 2550 1425 -2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 - 4425 750 4875 750 4875 1125 4425 1125 4425 750 -2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 3675 975 4425 975 -3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2 - 0 0 1.00 60.00 120.00 - 825 1050 1350 1050 - 0.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 - 0 0 1.00 60.00 120.00 - 1500 1125 1500 1350 1500 1500 1650 1500 1800 1500 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 - 0 0 1.00 60.00 
120.00 - 1950 1575 1950 1800 1950 1950 2100 1950 2250 1950 - 0.000 1.000 1.000 1.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 - 525 975 1125 975 - 0.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 - 525 1125 1125 1125 - 0.000 0.000 -3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7 - 0 0 1.00 60.00 120.00 - 3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975 - 3600 975 - 0.000 1.000 1.000 1.000 1.000 1.000 0.000 -4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001 -4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001 -4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001 -4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001 -4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001 -4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001 -4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001 -4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001 -4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001 diff --git a/lustre/portals/doc/portals3.bib b/lustre/portals/doc/portals3.bib deleted file mode 100644 index 323b99f..0000000 --- a/lustre/portals/doc/portals3.bib +++ /dev/null @@ -1,124 +0,0 @@ -@Article{ Cplant, - title = { {M}assively {P}arallel {C}omputing with - {C}ommodity {C}omponents }, - author = { Ron Brightwell and David S. Greenberg and Arthur - B. 
Maccabe and Rolf Riesen }, - journal = { Parallel Computing }, - volume = { 26 }, - month = { February }, - pages = { 243-266 }, - year = { 2000 } -} - -@Manual{ Portals, - organization = { Sandia National Laboratories }, - title = { {P}uma {P}ortals }, - note = { http://www.cs.sandia.gov/puma/portals }, - year = { 1997 } -} - -@Techreport{ VIA, - title = { {V}irtual {I}nterface {A}rchitecture - {S}pecification {V}ersion 1.0 }, - author = { {Compaq, Microsoft, and Intel} }, - institution = { Compaq, Microsoft, and Intel }, - month = { December }, - year = { 1997 } -} - -@Techreport{ ST, - title = { {I}nformation {T}echnology - {S}cheduled - {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 }, - author = { {Task Group of Technical Committee T11} }, - institution = { Accredited Standards Committee NCITS }, - month = { July }, - year = { 1998 } -} - -@Manual{ TFLOPS, - organization = { Sandia National Laboratories }, - title = { ASCI Red }, - note = { http://www.sandia.gov/ASCI/TFLOP }, - year = { 1996 } -} - -@Techreport{ GM, - title = { The {GM} {M}essage {P}assing {S}ystem }, - author = { {Myricom, Inc.} }, - institution = { {Myricom, Inc.} }, - year = { 1997 }, -} - -@Article{ MPIstandard, - title = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard }, - author = { {Message Passing Interface Forum} }, - journal = { The International Journal of Supercomputer Applications - and High Performance Computing }, - volume = { 8 }, - year = { 1994 } -} - -@Inproceedings{ PumaOS, - author = "Lance Shuler and Chu Jong and Rolf Riesen and - David van Dresser and Arthur B. Maccabe and - Lee Ann Fisk and T. Mack Stallcup", - booktitle = "Proceeding of the 1995 Intel Supercomputer - User's Group Conference", - title = "The {P}uma Operating System for Massively Parallel Computers", - organization = "Intel Supercomputer User's Group", - year = 1995 -} - -@InProceedings{ SUNMOS, -author = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and - Stephen R. 
Wheat", -title = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide", -booktitle = "Proceedings of the {Intel} Supercomputer Users' Group. 1994 - Annual North America Users' Conference.", -year = 1994, -pages = "245--251", -month = "June", -location = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps" -} - -@InProceedings { PumaMPI, - title = { Design and Implementation of {MPI} on {P}uma Portals }, - author = { Ron Brightwell and Lance Shuler }, - booktitle = { Proceedings of the Second MPI Developer's Conference }, - pages = { 18-25 }, - month = { July }, - year = { 1996 } -} - -@Inproceedings{ FM2, - author = { Mario Lauria and Scott Pakin and Andrew Chien }, - title = { {E}fficient {L}ayering for {H}igh {S}peed - {C}ommunication: {F}ast {M}essages 2.x }, - Booktitle = { Proceedings of the IEEE International Symposium - on High Performance Distributed Computing }, - year = { 1998 } -} - -@Manual { CraySHMEM, - title = "SHMEM Technical Note for C, SG-2516 2.3", - organization = "Cray Research, Inc.", - month = "October", - year = 1994 -} - -@Manual { MPI2, - title = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface", - organization = "Message Passing Interface Forum", - note = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html", - month = "July", - year = 1997 -} - -@InProceedings { PMMPI, - title = { {The Design and Implementation of Zero Copy MPI Using - Commodity Hardware with a High Performance Network} }, - author = { Francis O'Carroll and Hiroshi Tezuka and Atsushi Hori - and Yutaka Ishikawa }, - booktitle = { Proceedings of the ICS }, - year = { 1998 } -} diff --git a/lustre/portals/doc/portals3.lyx b/lustre/portals/doc/portals3.lyx deleted file mode 100644 index 8429280..0000000 --- a/lustre/portals/doc/portals3.lyx +++ /dev/null @@ -1,15944 +0,0 @@ -#LyX 1.2 created this file. 
For more info see http://www.lyx.org/ -\lyxformat 220 -\textclass report -\begin_preamble -\usepackage{fullpage} -\renewenvironment{comment}% -{\begin{quote}\textbf{Discussion}: \slshape}% -{\end{quote}} -\pagestyle{myheadings} -\end_preamble -\language american -\inputencoding auto -\fontscheme pslatex -\graphics default -\paperfontsize 10 -\spacing single -\papersize letterpaper -\paperpackage a4 -\use_geometry 0 -\use_amsmath 0 -\use_natbib 0 -\use_numerical_citations 0 -\paperorientation portrait -\secnumdepth 2 -\tocdepth 2 -\paragraph_separation indent -\defskip medskip -\quotes_language english -\quotes_times 2 -\papercolumns 1 -\papersides 2 -\paperpagestyle headings - -\layout Title - -The Portals 3.2 Message Passing Interface -\newline - Revision 1.1 -\layout Author - -Ron Brightwell -\begin_inset Foot -collapsed true - -\layout Standard - -R. - Brightwell and R. - Riesen are with the Scalable Computing Systems Department, Sandia National - Laboratories, P.O. - Box 5800, Albuquerque, NM\SpecialChar ~ -\SpecialChar ~ -87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov. -\end_inset - -, Arthur B. - Maccabe -\begin_inset Foot -collapsed true - -\layout Standard - -A. - B. - Maccabe is with the Computer Science Department, University of New Mexico, - Albuquerque, NM\SpecialChar ~ -\SpecialChar ~ -87131-1386, maccabe@cs.unm.edu. -\end_inset - -, Rolf Riesen and Trammell Hudson -\layout Abstract - -This report presents a specification for the Portals 3.2 message passing - interface. - Portals 3.2 is intended to allow scalable, high-performance network communicatio -n between nodes of a parallel computing system. - Specifically, it is designed to support a parallel computing platform composed - of clusters of commodity workstations connected by a commodity system area - network fabric. - In addition, Portals 3.2 is well suited to massively parallel processing - and embedded systems. 
- Portals 3.2 represents an adaption of the data movement layer developed - for massively parallel processing platforms, such as the 4500-node Intel - TeraFLOPS machine. - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -clearpage -\backslash -pagenumbering{roman} -\backslash -setcounter{page}{3} -\end_inset - - -\layout Standard - - -\begin_inset LatexCommand \tableofcontents{} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -cleardoublepage -\end_inset - - -\layout Standard - - -\begin_inset FloatList figure - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -cleardoublepage -\end_inset - - -\layout Standard - - -\begin_inset FloatList table - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -cleardoublepage -\end_inset - - -\layout Chapter* - -Summary of Changes for Revision 1.1 -\layout Enumerate - -Updated version number to 3.2 throughout the document -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sub:PtlGetId} - -\end_inset - -: added -\family typewriter -PTL_SEGV -\family default - to error list for -\shape italic -PtlGetId -\shape default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -: added -\family typewriter -PTL_ML_TOOLONG -\family default - to error list for -\shape italic -PtlMEAttach -\shape default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:meunlink} - -\end_inset - -: removed text referring to a list of associated memory descriptors. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:mdfree} - -\end_inset - -: added text to describe unlinking a free-floating memory descriptor. 
-\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:types} - -\end_inset - -: added entry for -\family typewriter -ptl_seq_t -\family default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - -: -\begin_deeper -\layout Enumerate - -added definition of -\family typewriter -max_offset -\family default -. -\layout Enumerate - -added text to clarify -\family typewriter -PTL_MD_MANAGE_REMOTE -\family default -. -\end_deeper -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:mdattach} - -\end_inset - -: modified text for -\family typewriter -unlink_op -\family default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:niinit} - -\end_inset - -: added text to clarify multiple calls to -\shape italic -PtlNIInit -\shape default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:mdattach} - -\end_inset - -: added text to clarify -\family typewriter -unlink_nofit -\family default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:receiving} - -\end_inset - -: removed text indicating that an MD will reject a message if the associated - EQ is full. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:mdfree} - -\end_inset - -: added -\family typewriter -PTL_MD_INUSE -\family default - error code and text to indicate that only MDs with no pending operations - can be unlinked. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:retcodes} - -\end_inset - -: added -\family typewriter -PTL_MD_INUSE -\family default - return code. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:event-type} - -\end_inset - -: added user id field, MD handle field, and NI specific failure field to - the -\family typewriter -ptl_event_t -\family default - structure. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:types} - -\end_inset - -: added -\family typewriter -ptl_ni_fail_t -\family default -. 
-\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:event-type} - -\end_inset - -: added -\family typewriter -PTL_EVENT_UNLINK -\family default - event type. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:func} - -\end_inset - -: removed -\shape slanted -PtlTransId -\shape default -. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -, Section -\begin_inset LatexCommand \ref{sec:meinsert} - -\end_inset - -, Section -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - -: listed allowable constants with relevant fields. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:func} - -\end_inset - -: added -\shape italic -PtlMEAttachAny -\shape default - function. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:retcodes} - -\end_inset - -: added -\family typewriter -PTL_PT_FULL -\family default - return code for -\shape italic -PtlMEAttachAny -\shape default -. -\layout Enumerate - -Table -\begin_inset LatexCommand \ref{tab:oconsts} - -\end_inset - -: updated to reflect new event types. -\layout Enumerate - -Section -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - -: added -\family typewriter -ptl_nid_t -\family default -, -\family typewriter -ptl_pid_t -\family default -, and -\family typewriter -ptl_uid_t -\family default -. -\layout Chapter* - -Summary of Changes for Version 3.1 -\layout Section* - -Thread Issues -\layout Standard - -The most significant change to the interface from version 3.0 to 3.1 involves - the clarification of how the interface interacts with multi-threaded applicatio -ns. - We adopted a generic thread model in which processes define an address - space and threads share the address space. 
- Consideration of the API in the light of threads led to several clarifications - throughout the document: -\layout Enumerate - -Glossary: -\begin_deeper -\layout Enumerate - -added a definition for -\emph on -thread -\emph default -, -\layout Enumerate - -reworded the definition for -\emph on -process -\emph default -. - -\end_deeper -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:apiover} - -\end_inset - -: added section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:threads} - -\end_inset - - to describe the multi-threading model used by the Portals API. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ptlinit} - -\end_inset - -: -\emph on -PtlInit -\emph default - must be called at least once and may be called any number of times. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ptlfini} - -\end_inset - -: -\emph on -PtlFini -\emph default - should be called once as the process is terminating and not as each thread - terminates. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:pid} - -\end_inset - -: Portals does not define thread ids. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - -: network interfaces are associated with processes, not threads. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:niinit} - -\end_inset - -: -\emph on -PtlNIInit -\emph default - must be called at least once and may be called any number of times. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:eqget} - -\end_inset - -: -\emph on -PtlEQGet -\emph default - returns -\family typewriter -PTL_EQ_EMPTY -\family default - if a thread is blocked on -\emph on -PtlEQWait -\emph default -.
- -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:eqwait} - -\end_inset - -: waiting threads are awakened in FIFO order. - -\layout Standard - -Two functions, -\emph on -PtlNIBarrier -\emph default - and -\emph on -PtlEQCount -\emph default - were removed from the API. - -\emph on -PtlNIBarrier -\emph default - was defined to block the calling process until all of the processes in - the application group had invoked -\emph on -PtlNIBarrier -\emph default -. - We now consider this functionality, along with the concept of groups (see - the discussion under -\begin_inset Quotes eld -\end_inset - -other changes -\begin_inset Quotes erd -\end_inset - -), to be part of the runtime system, not part of the Portals API. - -\emph on -PtlEQCount -\emph default - was defined to return the number of events in an event queue. - Because external operations may lead to new events being added and other - threads may remove events, the value returned by -\emph on -PtlEQCount -\emph default - would have to be a hint about the number of events in the event queue. -\layout Section* - -Handling small, unexpected messages -\layout Standard - -Another set of changes relates to handling small unexpected messages in - MPI. - In designing version 3.0, we assumed that each unexpected message would - be placed in a unique memory descriptor. - To avoid the need to process a long list of memory descriptors, we moved - the memory descriptors out of the match list and hung them off of a single - match list entry. - In this way, large unexpected messages would only encounter a single -\begin_inset Quotes eld -\end_inset - -short message -\begin_inset Quotes erd -\end_inset - - match list entry before encountering the -\begin_inset Quotes eld -\end_inset - -long message -\begin_inset Quotes erd -\end_inset - - match list entry. - Experience with this strategy identified resource management problems with - this approach. 
- In particular, a long sequence of very short (or zero length) messages - could quickly exhaust the memory descriptors constructed for handling unexpecte -d messages. - Our new strategy involves the use of several very large memory descriptors - for small unexpected messages. - Consecutive unexpected messages will be written into the first of these - memory descriptors until the memory descriptor fills up. - When the first of the -\begin_inset Quotes eld -\end_inset - -small memory -\begin_inset Quotes erd -\end_inset - - descriptors fills up, it will be unlinked and subsequent short messages - will be written into the next -\begin_inset Quotes eld -\end_inset - -short message -\begin_inset Quotes erd -\end_inset - - memory descriptor. - In this case, a -\begin_inset Quotes eld -\end_inset - -short message -\begin_inset Quotes erd -\end_inset - - memory descriptor will be declared full when it does not have sufficient - space for the largest small unexpected message. -\layout Standard - -This led to two significant changes. - First, each match list entry now has a single memory descriptor rather - than a list of memory descriptors. - Second, in addition to exceeding the operation threshold, a memory descriptor - can be unlinked when the local offset exceeds a specified value. - These changes have led to several changes in this document: -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{subsec:paddress} - -\end_inset - -: -\begin_deeper -\layout Enumerate - -removed references to the memory descriptor list, -\layout Enumerate - -changed the portals address translation description to indicate that unlinking - a memory descriptor implies unlinking the associated match list entry--match - list entries can no longer be unlinked independently from the memory descriptor.
- -\end_deeper -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -: -\begin_deeper -\layout Enumerate - -removed unlink from argument list, -\layout Enumerate - -removed description of -\family typewriter -ptl_unlink -\family default - type, -\layout Enumerate - -changed wording of the error condition when the Portal table index already - has an associated match list. - -\end_deeper -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:meinsert} - -\end_inset - -: removed unlink from argument list. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - -: added -\family typewriter -max_offset -\family default -. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdattach} - -\end_inset - -: -\begin_deeper -\layout Enumerate - -added description of -\family typewriter -ptl_unlink -\family default - type, -\layout Enumerate - -removed reference to memory descriptor lists, -\layout Enumerate - -changed wording of the error condition when match list entry already has - an associated memory descriptor, -\layout Enumerate - -changed the description of the -\family typewriter -unlink -\family default - argument. - -\end_deeper -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - -: removed -\family typewriter -PtlMDInsert -\family default - operation. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdbind} - -\end_inset - -: removed references to memory descriptor list. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdfree} - -\end_inset - -: removed reference to memory descriptor list. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:summary} - -\end_inset - -: removed references to PtlMDInsert. 
- -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:semantics} - -\end_inset - -: removed reference to memory descriptor list. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:exmpi} - -\end_inset - -: revised the MPI example to reflect the changes to the interface. - -\layout Standard - -Several changes have been made to improve the general documentation of the - interface. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - -: documented the special value -\family typewriter -PTL_EQ_NONE -\family default -. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - -: documented the special value -\family typewriter -PTL_ID_ANY -\family default -. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdbind} - -\end_inset - -: documented the return value -\family typewriter -PTL_INV_EQ -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdupdate} - -\end_inset - -: clarified the description of the -\emph on -PtlMDUpdate -\emph default - function. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:implvals} - -\end_inset - -: introduced a new section to document the implementation defined values. - -\layout Enumerate - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:summary} - -\end_inset - -: modified Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:oconsts} - -\end_inset - - to indicate where each constant is introduced and where it is used. - -\layout Section* - -Other changes -\layout Subsection* - -Implementation defined limits (Section -\begin_inset LatexCommand \ref{sec:niinit} - -\end_inset - -) -\layout Standard - -The earlier version provided implementation defined limits for the maximum - number of match entries, the maximum number of memory descriptors, etc. 
- Rather than spanning the entire implementation, these limits are now associated - with individual network interfaces. -\layout Subsection* - -Added User Ids (Section -\begin_inset LatexCommand \ref{sec:uid} - -\end_inset - -) -\layout Standard - -Group Ids had been used to simplify access control entries. - In particular, a process could allow access for all of the processes in - a group. - User Ids have been introduced to regain this functionality. - We use user ids to fill this role. -\layout Subsection* - -Removed Group Ids and Rank Ids (Section -\begin_inset LatexCommand \ref{sec:pid} - -\end_inset - -) -\layout Standard - -The earlier version of Portals had two forms for addressing processes: <node id, process id> and <group id, rank id>. - A process group was defined as the collection of processes created during - application launch. - Each process in the group was given a unique rank id in the range 0 to - -\begin_inset Formula $n-1$ -\end_inset - - where -\begin_inset Formula $n$ -\end_inset - - was the number of processes in the group. - We removed groups because they are better handled in the runtime system. -\layout Subsection* - -Match lists (Section -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -) -\layout Standard - -It is no longer illegal to have an existing match entry when calling PtlMEAttach. - A position argument was added to the list of arguments supplied to -\emph on -PtlMEAttach -\emph default - to specify whether the new match entry is prepended or appended to the - existing list. - If there is no existing match list, the position argument is ignored. -\layout Subsection* - -Unlinking Memory Descriptors (Section -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - -) -\layout Standard - -Previously, a memory descriptor could be unlinked if the offset exceeded - a threshold upon the completion of an operation. - In this version, the unlinking is delayed until there is a matching operation - which requires more memory than is currently available in the descriptor.
- In addition to changes in section, this led to a revision of Figure\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:flow} - -\end_inset - -. -\layout Subsection* - -Split Phase Operations and Events (Section -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - -) -\layout Standard - -Previously, there were five types of events: -\family typewriter -PTL_EVENT_PUT -\family default -, -\family typewriter -PTL_EVENT_GET -\family default -, -\family typewriter -PTL_EVENT_REPLY -\family default -, -\family typewriter -PTL_EVENT_SENT -\family default -, and -\family typewriter -PTL_EVENT_ACK. - -\family default -The first four of these reflected the completion of potentially long operations. - We have introduced new event types to reflect the fact that long operations - have a distinct starting point and a distinct completion point. - Moreover, the completion may be successful or unsuccessful. -\layout Standard - -In addition to providing a mechanism for reporting failure to higher levels - of software, this split provides an opportunity for improved ordering - semantics. - Previously, if one process initiated two operations (e.g., two put operations) - on a remote process, these operations were guaranteed to complete in the - same order that they were initiated. - Now, we only guarantee that the initiation events are delivered in the - same order. - In particular, the operations do not need to complete in the order that - they were initiated. -\layout Subsection* - -Well known process ids (Section -\begin_inset LatexCommand \ref{sec:niinit} - -\end_inset - -) -\layout Standard - -To support the notion of -\begin_inset Quotes eld -\end_inset - -well known process ids, -\begin_inset Quotes erd -\end_inset - - we added a process id argument to the arguments for PtlNIInit. -\layout Chapter* - -Glossary -\layout Description - -API Application Programming Interface. - A definition of the functions and semantics provided by a library of functions.
- -\layout Description - -Initiator A -\emph on -process -\emph default - that initiates a message operation. - -\layout Description - -Message An application-defined unit of data that is exchanged between -\emph on -processes -\emph default -. - -\layout Description - -Message\SpecialChar ~ -Operation Either a put operation, which writes data, or a get operation, - which reads data. - -\layout Description - -Network A network provides point-to-point communication between -\emph on -nodes -\emph default -. - Internally, a network may provide multiple routes between endpoints (to - improve fault tolerance or to improve performance characteristics); however, - multiple paths will not be exposed outside of the network. - -\layout Description - -Node A node is an endpoint in a -\emph on -network -\emph default -. - Nodes provide processing capabilities and memory. - A node may provide multiple processors (an SMP node) or it may act as a - -\emph on -gateway -\emph default - between networks. - -\layout Description - -Process A context of execution. - A process defines a virtual memory (VM) context. - This context is not shared with other processes. - Several threads may share the VM context defined by a process. - -\layout Description - -Target A -\emph on -process -\emph default - that is acted upon by a message operation. - -\layout Description - -Thread A context of execution that shares a VM context with other threads. - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -cleardoublepage -\layout Standard - -\backslash -setcounter{page}{1} -\backslash -pagenumbering{arabic} -\end_inset - - -\layout Chapter - -Introduction -\begin_inset LatexCommand \label{sec:intro} - -\end_inset - - -\layout Section - -Overview -\layout Standard - -This document describes an application programming interface for message - passing between nodes in a system area network. 
- The goal of this interface is to improve the scalability and performance - of network communication by defining the functions and semantics of message - passing required for scaling a parallel computing system to ten thousand - nodes. - This goal is achieved by providing an interface that will allow a quality - implementation to take advantage of the inherently scalable design of Portals. -\layout Standard - -This document is divided into several sections: -\layout Description - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:intro} - -\end_inset - ----Introduction This section describes the purpose and scope of the Portals - API. - -\layout Description - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:apiover} - -\end_inset - ----An\SpecialChar ~ -Overview\SpecialChar ~ -of\SpecialChar ~ -the\SpecialChar ~ -Portals\SpecialChar ~ -3.1\SpecialChar ~ -API This section gives a brief overview of the - Portals API. - The goal is to introduce the key concepts and terminology used in the descripti -on of the API. - -\layout Description - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:api} - -\end_inset - ----The\SpecialChar ~ -Portals\SpecialChar ~ -3.2\SpecialChar ~ -API This section describes the functions and semantics of - the Portals application programming interface. - -\layout Description - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:semantics} - -\end_inset - ----The\SpecialChar ~ -Semantics\SpecialChar ~ -of\SpecialChar ~ -Message\SpecialChar ~ -Transmission This section describes the semantics - of message transmission. - In particular, the information transmitted in each type of message and - the processing of incoming messages. - -\layout Description - -Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:examples} - -\end_inset - ----Examples This section presents several examples intended to illustrate - the use of the Portals API.
- -\layout Section - -Purpose -\layout Standard - -Existing message passing technologies available for commodity cluster networking - hardware do not meet the scalability goals required by the Cplant\SpecialChar ~ - -\begin_inset LatexCommand \cite{Cplant} - -\end_inset - - project at Sandia National Laboratories. - The goal of the Cplant project is to construct a commodity cluster that - can scale to the order of ten thousand nodes. - This number greatly exceeds the capacity for which existing message passing - technologies have been designed and implemented. -\layout Standard - -In addition to the scalability requirements of the network, these technologies - must also be able to support a scalable implementation of the Message Passing - Interface (MPI)\SpecialChar ~ - -\begin_inset LatexCommand \cite{MPIstandard} - -\end_inset - - standard, which has become the -\shape italic -de facto -\shape default - standard for parallel scientific computing. - While MPI does not impose any scalability limitations, existing message - passing technologies do not provide the functionality needed to allow implement -ations of MPI to meet the scalability requirements of Cplant. -\layout Standard - -The following are properties of a network architecture that do not impose - any inherent scalability limitations: -\layout Itemize - -Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~ - -\begin_inset LatexCommand \cite{VIA} - -\end_inset - - and TCP/IP sockets, have limitations on the number of peer connections - that can be established. - -\layout Itemize - -Network independence - Many communication systems depend on the host processor - to perform operations in order for messages in the network to be consumed. - Message consumption from the network should not be dependent on host processor - activity, such as the operating system scheduler or user-level thread scheduler. 
- -\layout Itemize - -User-level flow control - Many communication systems manage flow control - internally to avoid depleting resources, which can significantly impact - performance as the number of communicating processes increases. - -\layout Itemize - -OS Bypass - High performance network communication should not involve memory - copies into or out of a kernel-managed protocol stack. - -\layout Standard - -The following are properties of a network architecture that do not impose - scalability limitations for an implementation of MPI: -\layout Itemize - -Receiver-managed - Sender-managed message passing implementations require - a persistent block of memory to be available for every process, requiring - memory resources to increase with job size and requiring user-level flow - control mechanisms to manage these resources. - -\layout Itemize - -User-level Bypass - While OS Bypass is necessary for high-performance, it - alone is not sufficient to support the Progress Rule of MPI asynchronous - operations. - -\layout Itemize - -Unexpected messages - Few communication systems have support for receiving - messages for which there is no prior notification. - Support for these types of messages is necessary to avoid flow control - and protocol overhead. - -\layout Section - -Background -\layout Standard - -Portals was originally designed for and implemented on the nCube machine - as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~ - -\begin_inset LatexCommand \cite{SUNMOS} - -\end_inset - - and Puma\SpecialChar ~ - -\begin_inset LatexCommand \cite{PumaOS} - -\end_inset - - lightweight kernel development projects. - Portals went through two design phases, the latter of which is used on - the 4500-node Intel TeraFLOPS machine\SpecialChar ~ - -\begin_inset LatexCommand \cite{TFLOPS} - -\end_inset - -. 
- Portals have been very successful in meeting the needs of such a large - machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~ - -\begin_inset LatexCommand \cite{PumaMPI} - -\end_inset - -, but also for implementing the scalable run-time environment and parallel - I/O capabilities of the machine. -\layout Standard - -The second generation Portals implementation was designed to take full advantage - of the hardware architecture of large MPP machines. - However, efforts to implement this same design on commodity cluster technology - identified several limitations, due to the differences in network hardware - as well as to shortcomings in the design of Portals. -\layout Section - -Scalability -\layout Standard - -The primary goal in the design of Portals is scalability. - Portals are designed specifically for an implementation capable of supporting - a parallel job running on tens of thousands of nodes. - Performance is critical only in terms of scalability. - That is, the level of message passing performance is characterized by how - far it allows an application to scale and not by how it performs in micro-bench -marks (e.g., a two node bandwidth or latency test). -\layout Standard - -The Portals API is designed to allow for scalability, not to guarantee it. - Portals cannot overcome the shortcomings of a poorly designed application - program. - Applications that have inherent scalability limitations, either through - design or implementation, will not be transformed by Portals into scalable - applications. - Scalability must be addressed at all levels. - Portals do not inhibit scalability, but do not guarantee it either. -\layout Standard - -To support scalability, the Portals interface maintains a minimal amount - of state. - Portals provide reliable, ordered delivery of messages between pairs of - processes. 
- They are connectionless: a process is not required to explicitly establish - a point-to-point connection with another process in order to communicate. - Moreover, all buffers used in the transmission of messages are maintained - in user space. - The target process determines how to respond to incoming messages, and - messages for which there are no buffers are discarded. -\layout Section - -Communication Model -\layout Standard - -Portals combine the characteristics of both one-sided and two-sided communication. - They define a -\begin_inset Quotes eld -\end_inset - -matching put -\begin_inset Quotes erd -\end_inset - - operation and a -\begin_inset Quotes eld -\end_inset - -matching get -\begin_inset Quotes erd -\end_inset - - operation. - The destination of a put (or send) is not an explicit address; instead, - each message contains a set of match bits that allow the receiver to determine - where incoming messages should be placed. - This flexibility allows Portals to support both traditional one-sided operation -s and two-sided send/receive operations. -\layout Standard - -Portals allows the target to determine whether incoming messages are acceptable. - A target process can choose to accept message operations from any specific - process or can choose to ignore message operations from any specific process. -\layout Section - -Zero Copy, OS Bypass and Application Bypass -\layout Standard - -In traditional system architectures, network packets arrive at the network - interface card (NIC), are passed through one or more protocol layers in - the operating system, and eventually copied into the address space of the - application. - As network bandwidth began to approach memory copy rates, reduction of - memory copies became a critical concern. - This concern led to the development of zero-copy message passing protocols - in which message copies are eliminated or pipelined to avoid the loss of - bandwidth.
-\layout Standard - -A typical zero-copy protocol has the NIC generate an interrupt for the CPU - when a message arrives from the network. - The interrupt handler then controls the transfer of the incoming message - into the address space of the appropriate application. - The interrupt latency, the time from the initiation of an interrupt until - the interrupt handler is running, is fairly significant. - To avoid this cost, some modern NICs have processors that can be programmed - to implement part of a message passing protocol. - Given a properly designed protocol, it is possible to program the NIC to - control the transfer of incoming messages, without needing to interrupt - the CPU. - Because this strategy does not need to involve the OS on every message - transfer, it is frequently called -\begin_inset Quotes eld -\end_inset - -OS Bypass. -\begin_inset Quotes erd -\end_inset - - ST\SpecialChar ~ - -\begin_inset LatexCommand \cite{ST} - -\end_inset - -, VIA\SpecialChar ~ - -\begin_inset LatexCommand \cite{VIA} - -\end_inset - -, FM\SpecialChar ~ - -\begin_inset LatexCommand \cite{FM2} - -\end_inset - -, GM\SpecialChar ~ - -\begin_inset LatexCommand \cite{GM} - -\end_inset - -, and Portals are examples of OS Bypass protocols. -\layout Standard - -Many protocols that support OS Bypass still require that the application - actively participate in the protocol to ensure progress. - As an example, the long message protocol of PM requires that the application - receive and reply to a request to put or get a long message. - This complicates the runtime environment, requiring a thread to process - incoming requests, and significantly increases the latency required to - initiate a long message protocol. - The Portals message passing protocol does not require activity on the part - of the application to ensure progress. 
- We use the term -\begin_inset Quotes eld -\end_inset - -Application Bypass -\begin_inset Quotes erd -\end_inset - - to refer to this aspect of the Portals protocol. -\layout Section - -Faults -\layout Standard - -Given the number of components that we are dealing with and the fact that - we are interested in supporting applications that run for very long times, - failures are inevitable. - The Portals API recognizes that the underlying transport may not be able - to successfully complete an operation once it has been initiated. - This is reflected in the fact that the Portals API reports three types - of events: events indicating the initiation of an operation, events indicating - the successful completion of an operation, and events indicating the unsuccessf -ul completion of an operation. - Every initiation event is eventually followed by a successful completion - event or an unsuccessful completion event. -\layout Standard - -Between the time an operation is started and the time that the operation - completes (successfully or unsuccessfully), any memory associated with - the operation should be considered volatile. - That is, the memory may be changed in unpredictable ways while the operation - is progressing. - Once the operation completes, the memory associated with the operation - will not be subject to further modification (from this operation). - Notice that unsuccessful operations may alter memory in an essentially - unpredictable fashion. -\layout Chapter - -An Overview of the Portals API -\begin_inset LatexCommand \label{sec:apiover} - -\end_inset - - -\layout Standard - -In this section, we give a conceptual overview of the Portals API. - The goal is to provide a context for understanding the detailed description - of the API presented in the next section. -\layout Section - -Data Movement -\begin_inset LatexCommand \label{sec:dmsemantics} - -\end_inset - - -\layout Standard - -A Portal represents an opening in the address space of a process. 
- Other processes can use a Portal to read (get) or write (put) the memory - associated with the portal. - Every data movement operation involves two processes, the -\series bold -initiator -\series default - and the -\series bold -target -\series default -. - The initiator is the process that initiates the data movement operation. - The target is the process that responds to the operation by either accepting - the data for a put operation, or replying with the data for a get operation. -\layout Standard - -In this discussion, activities attributed to a process may refer to activities - that are actually performed by the process or -\emph on -on behalf of the process -\emph default -. - The inclusiveness of our terminology is important in the context of -\emph on -application bypass -\emph default -. - In particular, when we note that the target sends a reply in the case of - a get operation, it is possible that reply will be generated by another - component in the system, bypassing the application. -\layout Standard - -Figures\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:put} - -\end_inset - - and -\begin_inset LatexCommand \ref{fig:get} - -\end_inset - - present graphical interpretations of the Portal data movement operations: - put and get. - In the case of a put operation, the initiator sends a put request message - containing the data to the target. - The target translates the Portal addressing information in the request - using its local Portal structures. - When the request has been processed, the target optionally sends an acknowledge -ment message. 
-\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename put.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 218pt - lyxheight 119pt -\end_inset - - -\layout Caption - -Portal Put (Send) -\begin_inset LatexCommand \label{fig:put} - -\end_inset - - -\end_inset - - -\layout Standard - -In the case of a get operation, the initiator sends a get request to the - target. - As with the put operation, the target translates the Portal addressing - information in the request using its local Portal structures. - Once it has translated the Portal addressing information, the target sends - a reply that includes the requested data. -\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename get.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 218pt - lyxheight 119pt -\end_inset - - -\layout Caption - -Portal Get -\begin_inset LatexCommand \label{fig:get} - -\end_inset - - -\end_inset - - -\layout Standard - -We should note that Portal address translations are only performed on nodes - that respond to operations initiated by other nodes. - Acknowledgements and replies to get operations bypass the portals address - translation structures. -\layout Section - -Portal Addressing -\begin_inset LatexCommand \label{subsec:paddress} - -\end_inset - - -\layout Standard - -One-sided data movement models (e.g., shmem\SpecialChar ~ - -\begin_inset LatexCommand \cite{CraySHMEM} - -\end_inset - -, ST\SpecialChar ~ - -\begin_inset LatexCommand \cite{ST} - -\end_inset - -, MPI-2\SpecialChar ~ - -\begin_inset LatexCommand \cite{MPI2} - -\end_inset - -) typically use a triple to address memory on a remote node. - This triple consists of a process id, memory buffer id, and offset. 
- The process id identifies the target process, the memory buffer id specifies - the region of memory to be used for the operation, and the offset specifies - an offset within the memory buffer. -\layout Standard - -In addition to the standard address components (process id, memory buffer - id, and offset), a Portal address includes a set of match bits. - This addressing model is appropriate for supporting one-sided operations - as well as traditional two-sided message passing operations. - Specifically, the Portals API provides the flexibility needed for an efficient - implementation of MPI-1, which defines two-sided operations with one-sided - completion semantics. -\layout Standard - -Figure\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:portals} - -\end_inset - - presents a graphical representation of the structures used by a target - in the interpretation of a Portal address. - The process id is used to route the message to the appropriate node and - is not reflected in this diagram. - The memory buffer id, called the -\series bold -portal id -\series default -, is used as an index into the Portal table. - Each element of the Portal table identifies a match list. - Each element of the match list specifies two bit patterns: a set of -\begin_inset Quotes eld -\end_inset - -don't care -\begin_inset Quotes erd -\end_inset - - bits, and a set of -\begin_inset Quotes eld -\end_inset - -must match -\begin_inset Quotes erd -\end_inset - - bits. - In addition to the two sets of match bits, each match list element has - at most one memory descriptor. - Each memory descriptor identifies a memory region and an optional event - queue. - The memory region specifies the memory to be used in the operation and - the event queue is used to record information about these operations. 
-\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename portals.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 305pt - lyxheight 106pt -\end_inset - - -\layout Caption - -Portal Addressing Structures -\begin_inset LatexCommand \label{fig:portals} - -\end_inset - - -\end_inset - - -\layout Standard - -Figure\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:flow} - -\end_inset - - illustrates the steps involved in translating a Portal address, starting - from the first element in a match list. - If the match criteria specified in the match list entry are met and the - memory descriptor list accepts the operation -\begin_inset Foot -collapsed true - -\layout Standard - -Memory descriptors can reject operations because a threshold has been exceeded - or because the memory region does not have sufficient space, see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - - -\end_inset - -, the operation (put or get) is performed using the memory region specified - in the memory descriptor. - If the memory descriptor specifies that it is to be unlinked when a threshold - has been exceeded, the match list entry is removed from the match list - and the resources associated with the memory descriptor and match list - entry are reclaimed. - Finally, if there is an event queue specified in the memory descriptor, - the operation is logged in the event queue. 
-\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename flow_new.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 447pt - lyxheight 282pt -\end_inset - - -\layout Caption - -Portals Address Translation -\begin_inset LatexCommand \label{fig:flow} - -\end_inset - - -\end_inset - - -\layout Standard - -If the match criteria specified in the match list entry are not met, or - there is no memory descriptor associated with the match list entry, or - the memory descriptor associated with the match list entry rejects the - operation, the address translation continues with the next match list entry. - If the end of the match list has been reached, the address translation - is aborted and the incoming request is discarded. -\layout Section - -Access Control -\layout Standard - -A process can control access to its portals using an access control list. - Each entry in the access control list specifies a process id and a Portal - table index. - The access control list is actually an array of entries. - Each incoming request includes an index into the access control list (i.e., - a -\begin_inset Quotes eld -\end_inset - -cookie -\begin_inset Quotes erd -\end_inset - - or hint). - If the id of the process issuing the request doesn't match the id specified - in the access control list entry or the Portal table index specified in - the request doesn't match the Portal table index specified in the access - control list entry, the request is rejected. - Process identifiers and Portal table indexes may include wild card values - to increase the flexibility of this mechanism. - -\layout Standard - -Two aspects of this design merit further discussion. - First, the model assumes that the information in a message header, the - sender's id in particular, is trustworthy.
- In most contexts, we assume that the entity that constructs the header - is trustworthy; however, using cryptographic techniques, we could easily - devise a protocol that would ensure the authenticity of the sender. -\layout Standard - -Second, because the access check is performed by the receiver, it is possible - that a malicious process will generate thousands of messages that will - be denied by the receiver. - This could saturate the network and/or the receiver, resulting in a -\emph on -denial of service -\emph default - attack. - Moving the check to the sender using capabilities, would remove the potential - for this form of attack. - However, the solution introduces the complexities of capability management - (exchange of capabilities, revocation, protections, etc). -\layout Section - -Multi-threaded Applications -\begin_inset LatexCommand \label{sec:threads} - -\end_inset - - -\layout Standard - -The Portals API supports a generic view of multi-threaded applications. - From the perspective of the Portals API, an application program is defined - by a set of processes. - Each process defines a unique address space. - The Portals API defines access to this address space from other processes - (using portals addressing and the data movement operations). - A process may have one or more -\emph on -threads -\emph default - executing in its address space. - -\layout Standard - -With the exception of -\emph on -PtlEQWait -\emph default - every function in the Portals API is non-blocking and atomic with respect - to both other threads and external operations that result from data movement - operations. - While individual operations are atomic, sequences of these operations may - be interleaved between different threads and with external operations. - The Portals API does not provide any mechanisms to control this interleaving. - It is expected that these mechanisms will be provided by the API used to - create threads. 
-\layout Chapter - -The Portals API -\begin_inset LatexCommand \label{sec:api} - -\end_inset - - -\layout Section - -Naming Conventions -\begin_inset LatexCommand \label{sec:conv} - -\end_inset - - -\layout Standard - -The Portals API defines two types of entities: functions and types. - Functions always start with -\emph on -Ptl -\emph default - and use mixed upper and lower case. - When used in the body of this report, function names appear in italic face, - e.g., -\emph on -PtlInit -\emph default -. - The functions associated with an object type will have names that start - with -\emph on -Ptl -\emph default -, followed by the two letter object type code shown in Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:objcodes} - -\end_inset - -. - As an example, the function -\emph on -PtlEQAlloc -\emph default - allocates resources for an event queue. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Object Type Codes -\begin_inset LatexCommand \label{tab:objcodes} - -\end_inset - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\newline - -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\emph on -xx -\end_inset - - -\begin_inset Text - -\layout Standard - - Name -\end_inset - - -\begin_inset Text - -\layout Standard - - Section -\end_inset - - - - -\begin_inset Text - -\layout Standard - -EQ -\end_inset - - -\begin_inset Text - -\layout Standard - - Event Queue -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - MD -\end_inset - - -\begin_inset Text - -\layout Standard - - Memory Descriptor -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - - -\end_inset - - - - -\begin_inset
Text - -\layout Standard - - ME -\end_inset - - -\begin_inset Text - -\layout Standard - - Match list Entry -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - NI -\end_inset - - -\begin_inset Text - -\layout Standard - - Network Interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Type names use lower case with underscores to separate words. - Each type name starts with -\family typewriter -ptl -\family default -_ and ends with -\family typewriter -_t -\family default -. - When used in the body of this report, type names appear in a fixed font, - e.g., -\family typewriter -ptl_match_bits_t -\family default -. -\layout Standard - -Names for constants use upper case with underscores to separate words. - Each constant name starts with -\family typewriter -PTL_ -\family default -. - When used in the body of this report, constant names appear in a fixed font, - e.g., -\family typewriter -PTL_OK -\family default -. -\layout Section - -Base Types -\layout Standard - -The Portals API defines a variety of base types. - These types represent a simple renaming of the base types provided by the - C programming language. - In most cases these new type names have been introduced to improve type - safety and to avoid issues arising from differences in representation sizes - (e.g., 16-bit or 32-bit integers). -\layout Subsection - -Sizes -\begin_inset LatexCommand \label{sec:size-t} - -\end_inset - - -\layout Standard - -The type -\family typewriter -ptl_size_t -\family default - is an unsigned 64-bit integral type used for representing sizes.
-\layout Subsection - -Handles -\begin_inset LatexCommand \label{sec:handle-type} - -\end_inset - - -\layout Standard - -Objects maintained by the API are accessed through handles. - Handle types have names of the form -\family typewriter -ptl_handle_ -\emph on -xx -\emph default -_t -\family default -, where -\emph on -xx -\emph default - is one of the two letter object type codes shown in Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:objcodes} - -\end_inset - -. - For example, the type -\family typewriter -ptl_handle_ni_t -\family default - is used for network interface handles. -\layout Standard - -Each type of object is given a unique handle type to enhance type checking. - The type, -\family typewriter -ptl_handle_any_t -\family default -, can be used when a generic handle is needed. - Every handle value can be converted into a value of type -\family typewriter -ptl_handle_any_t -\family default - without loss of information. -\layout Standard - -Handles are not simple values. - Every portals object is associated with a specific network interface and - an identifier for this interface (along with an object identifier) is part - of the handle for the object. -\layout Standard - -The special value -\family typewriter -PTL_EQ_NONE -\family default -, of type -\family typewriter -ptl_handle_eq_t -\family default -, is used to indicate the absence of an event queue. - See sections -\begin_inset LatexCommand \ref{sec:mdfree} - -\end_inset - - and\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:mdupdate} - -\end_inset - - for uses of this value. -\layout Subsection - -Indexes -\begin_inset LatexCommand \label{sec:index-type} - -\end_inset - - -\layout Standard - -The types -\family typewriter -ptl_pt_index_t -\family default - and -\family typewriter -ptl_ac_index_t -\family default - are integral types used for representing Portal table indexes and access - control tables indexes, respectively. 
- See section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:niinit} - -\end_inset - - for limits on values of these types. -\layout Subsection - -Match Bits -\begin_inset LatexCommand \label{sec:mb-type} - -\end_inset - - -\layout Standard - -The type -\family typewriter -ptl_match_bits_t -\family default - is capable of holding unsigned 64-bit integer values. -\layout Subsection - -Network Interfaces -\begin_inset LatexCommand \label{sec:ni-type} - -\end_inset - - -\layout Standard - -The type -\family typewriter -ptl_interface_t -\family default - is an integral type used for identifying different network interfaces. - Users will need to consult the local documentation to determine appropriate - values for the interfaces available. - The special value -\family typewriter -PTL_IFACE_DEFAULT -\family default - identifies the default interface. -\layout Subsection - -Identifiers -\begin_inset LatexCommand \label{sec:id-type} - -\end_inset - - -\layout Standard - -The type -\family typewriter -ptl_nid_t -\family default - is an integral type used for representing node ids -\family typewriter -, ptl_pid_t -\family default - is an integral type for representing process ids, and -\family typewriter -ptl_uid_t -\family default -is an integral type for representing user ids. -\layout Standard - -The special values -\family typewriter -PTL_PID_ANY -\family default - matches any process identifier, PTL_NID_ANY matches any node identifier, - and -\family typewriter -PTL_UID_ANY -\family default - matches any user identifier. - See sections -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - - and\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:acentry} - -\end_inset - - for uses of these values. 
-\layout Subsection - -Status Registers -\begin_inset LatexCommand \label{sec:stat-type} - -\end_inset - - -\layout Standard - -Each network interface maintains an array of status registers that can be - accessed using the -\family typewriter -PtlNIStatus -\family default - function (see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:nistatus} - -\end_inset - -). - The type -\family typewriter -ptl_sr_index_t -\family default - defines the types of indexes that can be used to access the status registers. - The only index defined for all implementations is -\family typewriter -PTL_SR_DROP_COUNT -\family default - which identifies the status register that counts the dropped requests for - the interface. - Other indexes (and registers) may be defined by the implementation. -\layout Standard - -The type -\family typewriter -ptl_sr_value_t -\family default - defines the types of values held in status registers. - This is a signed integer type. - The size is implementation dependent, but must be at least 32 bits. -\layout Section - -Initialization and Cleanup -\begin_inset LatexCommand \label{sec:init} - -\end_inset - - -\layout Standard - -The Portals API includes a function, -\emph on -PtlInit -\emph default -, to initialize the library and a function, -\emph on -PtlFini -\emph default -, to cleanup after the application is done using the library. -\layout Subsection - -PtlInit -\begin_inset LatexCommand \label{sec:ptlinit} - -\end_inset - - -\layout LyX-Code - -int PtlInit( int *max_interfaces ); -\layout Standard -\noindent -The -\emph on -PtlInit -\emph default - function initializes the Portals library. - PtlInit must be called at least once by a process before any thread makes - a Portals function call, but may be safely called more than once. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_FAIL Indicates an error during initialization. 
- -\layout Description - -PTL_SEGV Indicates that -\family typewriter -max_interfaces -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -max_interfaces -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the maximum number of interfaces - that can be initialized. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlFini -\begin_inset LatexCommand \label{sec:ptlfini} - -\end_inset - - -\layout LyX-Code - -void PtlFini( void ); -\layout Standard -\noindent -The -\emph on -PtlFini -\emph default - function cleans up after the Portals library is no longer needed by a process. - After this function is called, calls to any of the functions defined by - the Portal API or use of the structures set up by the Portals API will - result in undefined behavior. - This function should be called once and only once during termination by - a process. - Typically, this function will be called in the exit sequence of a process. - Individual threads should not call PtlFini when they terminate. -\layout Section - -Network Interfaces -\begin_inset LatexCommand \label{sec:ni} - -\end_inset - - -\layout Standard - -The Portals API supports the use of multiple network interfaces. - However, each interface is treated as an independent entity. - Combining interfaces (e.g., -\begin_inset Quotes eld -\end_inset - -bonding -\begin_inset Quotes erd -\end_inset - - to create a higher bandwidth connection) must be implemented by the application - or embedded in the underlying network. - Interfaces are treated as independent entities to make it easier to cache - information on individual network interface cards. 
-\layout Standard - -Once initialized, each interface provides a Portal table, an access control - table, and a collection of status registers. - See Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - for a discussion of updating Portal table entries using the -\emph on -PtlMEAttach -\emph default - function. - See Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ac} - -\end_inset - - for a discussion of the initialization and updating of entries in the access - control table. - See Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:nistatus} - -\end_inset - - for a discussion of the -\emph on -PtlNIStatus -\emph default - function which can be used to determine the value of a status register. -\layout Standard - -Every other type of Portal object (e.g., memory descriptor, event queue, or - match list entry) is associated with a specific network interface. - The association to a network interface is established when the object is - created and is encoded in the handle for the object. -\layout Standard - -Each network interface is initialized and shutdown independently. - The initialization routine, -\emph on -PtlNIInit -\emph default -, returns a handle for an interface object which is used in all subsequent - Portal operations. - The -\emph on -PtlNIFini -\emph default - function is used to shutdown an interface and release any resources that - are associated with the interface. - Network interface handles are associated with processes, not threads. - All threads in a process share all of the network interface handles. 
-\layout Standard - -The Portals API also defines the -\emph on -PtlNIStatus -\emph default - function to query the status registers for a network interface, the -\emph on -PtlNIDist -\emph default - function to determine the -\begin_inset Quotes eld -\end_inset - -distance -\begin_inset Quotes erd -\end_inset - - to another process, and the -\emph on -PtlNIHandle -\emph default - function to determine the network interface that an object is associated - with. -\layout Subsection - -PtlNIInit -\begin_inset LatexCommand \label{sec:niinit} - -\end_inset - - -\layout LyX-Code - -typedef struct { -\newline - int max_match_entries; -\newline - int max_mem_descriptors; -\newline - int max_event_queues; -\newline - ptl_ac_index_t max_atable_index; -\newline - ptl_pt_index_t max_ptable_index; -\newline -} ptl_ni_limits_t; -\newline - -\newline -int PtlNIInit( ptl_interface_t interface, -\newline - ptl_pid_t pid, -\newline - ptl_ni_limits_t* desired, -\newline - ptl_ni_limits_t* actual, -\newline - ptl_handle_ni_t* handle ); -\layout Standard - -Values of type -\family typewriter -ptl_ni_limits_t -\family default - include the following members: -\layout Description - -max_match_entries Maximum number of match entries that can be allocated - at any one time. -\layout Description - -max_mem_descriptors Maximum number of memory descriptors that can be allocated - at any one time. -\layout Description - -max_event_queues Maximum number of event queues that can be allocated at - any one time. -\layout Description - -max_atable_index Largest access control table index for this interface, - valid indexes range from zero to -\family typewriter -max_atable_index -\family default -, inclusive. -\layout Description - -max_ptable_index Largest Portal table index for this interface, valid indexes - range from zero to -\family typewriter -max_ptable_index -\family default -, inclusive.
-\layout Standard -\noindent -The -\emph on -PtlNIInit -\emph default - function is used to initialize the Portals API for a network interface. - This function must be called at least once by each process before any other - operations that apply to the interface by any process or thread. - For subsequent calls to -\shape italic -PtlNIInit -\shape default - from within the same process (either by different threads or the same thread), - the desired limits will be ignored and the call will return the existing - NI handle. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INIT_DUP Indicates a duplicate initialization of -\family typewriter -interface -\family default -. - -\layout Description - -PTL_INIT_INV Indicates that -\family typewriter -interface -\family default - is not a valid network interface. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to initialize the - interface. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -pid -\family default - is not a valid process id. -\layout Description - -PTL_SEGV Indicates that -\family typewriter -actual -\family default - or -\family typewriter - handle -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the network interface to be initialized. - (See section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ni-type} - -\end_inset - - for a discussion of values used to identify network interfaces.)
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -pid -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the desired process id (for well known process ids). - The value -\family typewriter -PTL_PID_ANY -\family default - may be used to have the process id assigned by the underlying library. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -desired -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -If non-NULL, points to a structure that holds the desired limits. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -actual -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, the location pointed to by actual will hold the actual - limits. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the interface. -\end_inset - - - - -\end_inset - - -\layout Comment - -The use of desired is implementation dependent. - In particular, an implementation may choose to ignore this argument. -\layout Subsection - -PtlNIFini -\begin_inset LatexCommand \label{sec:nifini} - -\end_inset - - -\layout LyX-Code - -int PtlNIFini( ptl_handle_ni_t interface ); -\layout Standard -\noindent -The -\emph on -PtlNIFini -\emph default - function is used to release the resources allocated for a network interface. 
- Once the -\emph on -PtlNIFini -\emph default - operation has been started, the results of pending API operations (e.g., - operations initiated by another thread) for this interface are undefined. - Similarly, the effects of incoming operations (puts and gets) or return - values (acknowledgements and replies) for this interface are undefined. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard - -A handle for the interface to shutdown. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlNIStatus -\begin_inset LatexCommand \label{sec:nistatus} - -\end_inset - - -\layout LyX-Code - -int PtlNIStatus( ptl_handle_ni_t interface, -\newline - ptl_sr_index_t status_register, -\newline - ptl_sr_value_t* status ); -\layout Standard -\noindent -The -\emph on -PtlNIStatus -\emph default - function returns the value of a status register for the specified interface. - (See section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:stat-type} - -\end_inset - - for more information on status register indexes and status register values.) -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. 
- -\layout Description - -PTL_INV_SR_INDX Indicates that -\family typewriter -status_register -\family default - is not a valid status register. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -status -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the interface to use. - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -status_register -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -An index for the status register to read. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -status -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the current value of the status - register. -\end_inset - - - - -\end_inset - - -\layout Comment - -The only status register that must be defined is a drop count register ( -\family typewriter -PTL_SR_DROP_COUNT -\family default -). - Implementations may define additional status registers. - Identifiers for the indexes associated with these registers should start - with the prefix -\family typewriter -PTL_SR_ -\family default -. -\layout Subsection - -PtlNIDist -\layout LyX-Code - -int PtlNIDist( ptl_handle_ni_t interface, -\newline - ptl_process_id_t process, -\newline - unsigned long* distance ); -\layout Standard -\noindent -The -\emph on -PtlNIDist -\emph default - function returns the distance to another process using the specified interface. 
- Distances are only defined relative to an interface. - Distance comparisons between different interfaces on the same process may - be meaningless. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -process -\family default - is not a valid process identifier. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -distance -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the interface to use. - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -process -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -An identifier for the process whose distance is being requested. - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -distance -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the distance to the remote - process. -\end_inset - - - - -\end_inset - - -\layout Comment - -This function should return a static measure of distance. - Examples include minimum latency, the inverse of available bandwidth, or - the number of switches between the two endpoints. 
-\layout Subsection - -PtlNIHandle -\layout LyX-Code - -int PtlNIHandle( ptl_handle_any_t handle, -\newline - ptl_handle_ni_t* interface ); -\layout Standard -\noindent -The -\emph on -PtlNIHandle -\emph default - function returns a handle for the network interface with which the object - identified by -\family typewriter -handle -\family default - is associated. - If the object identified by -\family typewriter -handle -\family default - is a network interface, this function returns the same value it is passed. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_HANDLE Indicates that -\family typewriter -handle -\family default - is not a valid handle. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -interface -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the object. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the network interface - associated with -\family typewriter -handle -\family default -. -\end_inset - - - - -\end_inset - - -\layout Comment - -Every handle should encode the network interface and the object id relative - to this handle. - Both are presumably encoded using integer values. 
-\layout Section - -User Identification -\begin_inset LatexCommand \label{sec:uid} - -\end_inset - - -\layout Standard - -Every process runs on behalf of a user. - -\layout Subsection - -PtlGetUid -\layout LyX-Code - -int PtlGetUid( ptl_handle_ni_t ni_handle, -\newline - ptl_uid_t* uid ); -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -ni_handle -\family default - is not a valid network interface handle. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -uid -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ni_handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A network interface handle. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -uid -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the user id for the calling - process. -\end_inset - - - - -\end_inset - - -\layout Comment - -Note that user identifiers are dependent on the network interface(s). - In particular, if a node has multiple interfaces, a process may have multiple - user identifiers. -\layout Section - -Process Identification -\begin_inset LatexCommand \label{sec:pid} - -\end_inset - - -\layout Standard - -Processes that use the Portals API can be identified using a node id and - process id.
- Every node accessible through a network interface has a unique node identifier - and every process running on a node has a unique process identifier. - As such, any process in the computing system can be identified by its node - id and process id. - -\layout Standard - -The Portals API defines a type, -\family typewriter -ptl_process_id_t -\family default - for representing process ids and a function, -\emph on -PtlGetId -\emph default -, which can be used to obtain the id of the current process. -\layout Comment - -The portals API does not include thread identifiers. - Messages are delivered to processes (address spaces) not threads (contexts - of execution). -\layout Subsection - -The Process Id Type -\begin_inset LatexCommand \label{sec:pid-type} - -\end_inset - - -\layout LyX-Code - -typedef struct { -\newline - ptl_nid_t nid; /* node id */ -\newline - ptl_pid_t pid; /* process id */ -\newline -} ptl_process_id_t; -\layout Standard -\noindent -The -\family typewriter -ptl_process_id_t -\family default - type uses two identifiers to represent a process id: a node id and a process - id. - -\layout Subsection - -PtlGetId -\begin_inset LatexCommand \label{sub:PtlGetId} - -\end_inset - - -\layout LyX-Code - -int PtlGetId( ptl_handle_ni_t ni_handle, -\newline - ptl_process_id_t* id ); -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -ni_handle -\family default - is not a valid network interface handle. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -id -\family default - is not a legal address. 
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ni_handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A network interface handle. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -id -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the id for the calling process. -\end_inset - - - - -\end_inset - - -\layout Comment - -Note that process identifiers are dependent on the network interface(s). - In particular, if a node has multiple interfaces, it may have multiple - node identifiers. -\layout Section - -Match List Entries and Match Lists -\begin_inset LatexCommand \label{sec:me} - -\end_inset - - -\layout Standard - -A match list is a chain of match list entries. - Each match list entry includes a memory descriptor and a set of match criteria. - The match criteria can be used to reject incoming requests based on process - id or the match bits provided in the request. - A match list is created using the -\emph on -PtlMEAttach -\emph default - or -\shape italic -PtlMEAttachAny -\shape default - functions, which create a match list consisting of a single match list - entry, attach the match list to the specified Portal index, and return - a handle for the match list entry. - Match entries can be dynamically inserted into and removed from a match list - using the -\emph on -PtlMEInsert -\emph default - and -\emph on -PtlMEUnlink -\emph default - functions.
-\layout Subsection - -PtlMEAttach -\begin_inset LatexCommand \label{sec:meattach} - -\end_inset - - -\layout LyX-Code - -typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t; -\newline - -\layout LyX-Code - -typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t; -\newline - -\layout LyX-Code - -int PtlMEAttach( ptl_handle_ni_t interface, -\newline - ptl_pt_index_t index, -\newline - ptl_process_id_t matchid, -\newline - ptl_match_bits_t match_bits, -\newline - ptl_match_bits_t ignorebits, -\newline - ptl_unlink_t unlink, -\newline - ptl_ins_pos_t position, -\newline - ptl_handle_me_t* handle ); -\layout Standard -\noindent -Values of the type -\family typewriter -ptl_ins_pos_t -\family default - are used to control where a new item is inserted. - The value -\family typewriter -PTL_INS_BEFORE -\family default - is used to insert the new item before the current item or before the head - of the list. - The value -\family typewriter -PTL_INS_AFTER -\family default - is used to insert the new item after the current item or after the last - item in the list. - -\layout Standard - -The -\emph on -PtlMEAttach -\emph default - function creates a match list consisting of a single entry and attaches - this list to the Portal table for -\family typewriter -interface -\family default -. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_PTINDEX Indicates that -\family typewriter -index -\family default - is not a valid Portal table index. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -matchid -\family default - is not a valid process identifier. 
- -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - match list entry. - -\layout Description - -PTL_ML_TOOLONG Indicates that the resulting match list is too long. - The maximum length for a match list is defined by the interface. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the interface to use. - -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The Portal table index where the match list should be attached. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -matchid -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Specifies the match criteria for the process id of the requestor. - The constants -\family typewriter -PTL_PID_ANY -\family default - and -\family typewriter -PTL_NID_ANY -\family default - can be used to wildcard either of the ids in the -\family typewriter -ptl_process_id_t -\family default - structure. - -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -match_bits, ignorebits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Specify the match criteria to apply to the match bits in the incoming request. - The -\family typewriter -ignorebits -\family default - are used to mask out insignificant bits in the incoming match bits. 
- The resulting bits are then compared to the match list entry's match - bits to determine if the incoming request meets the match criteria. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -unlink -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Indicates the match list entry should be unlinked when the last memory descripto -r associated with this match list entry is unlinked. - (Note, the check for unlinking a match entry only occurs when a memory - descriptor is unlinked.) -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -position -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Indicates whether the new match entry should be prepended or appended to - the existing match list. - If there is no existing list, this argument is ignored and the new match - entry becomes the only entry in the list. - Allowed constants: -\family typewriter -PTL_INS_BEFORE -\family default -, -\family typewriter -PTL_INS_AFTER -\family default -. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the newly created - match list entry. 
-\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMEAttachAny -\begin_inset LatexCommand \label{sec:attachany} - -\end_inset - - -\layout LyX-Code - -int PtlMEAttachAny( ptl_handle_ni_t interface, -\newline - ptl_pt_index_t *index, -\newline - ptl_process_id_t matchid, -\newline - ptl_match_bits_t match_bits, -\newline - ptl_match_bits_t ignorebits, -\newline - ptl_unlink_t unlink, -\newline - ptl_handle_me_t* handle ); -\layout Standard - -The -\emph on -PtlMEAttachAny -\emph default - function creates a match list consisting of a single entry and attaches - this list to an unused Portal table entry for -\family typewriter -interface -\family default -. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -matchid -\family default - is not a valid process identifier. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - match list entry. - -\layout Description - -PTL_PT_FULL Indicates that there are no free entries in the Portal table. -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the interface to use. 
- -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On succesfful return, this location will hold the Portal index where the - match list has been attached. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -matchid, match_bits, ignorebits, unlink -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -See the discussion for -\shape italic -PtlMEAttach -\shape default -. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the newly created - match list entry. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMEInsert -\begin_inset LatexCommand \label{sec:meinsert} - -\end_inset - - -\layout LyX-Code - -int PtlMEInsert( ptl_handle_me_t current, -\newline - ptl_process_id_t matchid, -\newline - ptl_match_bits_t match_bits, -\newline - ptl_match_bits_t ignorebits, -\newline - ptl_ins_pos_t position, -\newline - ptl_handle_me_t* handle ); -\layout Standard - -The -\emph on -PtlMEInsert -\emph default - function creates a new match list entry and inserts this entry into the - match list containing -\family typewriter -current -\family default -. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -matchid -\family default - is not a valid process identifier. 
- -\layout Description - -PTL_INV_ME Indicates that -\family typewriter -current -\family default - is not a valid match entry handle. - -\layout Description - -PTL_ML_TOOLONG Indicates that the resulting match list is too long. - The maximum length for a match list is defined by the interface. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - match entry. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -current -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for a match entry. - The new match entry will be inserted immediately before or immediately - after this match entry. -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -matchid -\family default -, -\family typewriter -match_bits -\family default -, -\family typewriter -ignorebits -\family default -, -\family typewriter -unlink -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -See the discussion for -\emph on -PtlMEAttach -\emph default - -\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -position -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Indicates whether the new match entry should be inserted before or after - the -\family typewriter -current -\family default - entry. - Allowed constants: -\family typewriter -PTL_INS_BEFORE -\family default -, -\family typewriter -PTL_INS_AFTER -\family default -. 
-\end_inset - - - - -\begin_inset Text - -\layout Standard -\noindent - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -See the discussion for -\emph on -PtlMEAttach -\emph default -. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMEUnlink -\begin_inset LatexCommand \label{sec:meunlink} - -\end_inset - - -\layout LyX-Code - -int PtlMEUnlink( ptl_handle_me_t entry ); -\layout Standard -\noindent -The -\emph on -PtlMEUnlink -\emph default - function can be used to unlink a match entry from a match list. - This operation also releases any resources associated with the match entry - (including the associated memory descriptor). - It is an error to use the match entry handle after calling -\emph on -PtlMEUnlink -\emph default -. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_ME Indicates that -\family typewriter -entry -\family default - is not a valid match entry handle. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -entry -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard - -A handle for the match entry to be unlinked. -\end_inset - - - - -\end_inset - - -\layout Section - -Memory Descriptors -\begin_inset LatexCommand \label{sec:md} - -\end_inset - - -\layout Standard - -A memory descriptor contains information about a region of an application - process' memory and an event queue where information about the operations - performed on the memory descriptor are recorded. 
- The Portals API provides two operations to create memory descriptors: -\emph on -PtlMDAttach -\emph default -, and -\emph on -PtlMDBind -\emph default -; an operation to update a memory descriptor, -\emph on -PtlMDUpdate -\emph default -; and an operation to unlink and release the resources associated with a - memory descriptor, -\emph on -PtlMDUnlink -\emph default -. -\layout Subsection - -The Memory Descriptor Type -\begin_inset LatexCommand \label{sec:md-type} - -\end_inset - - -\layout LyX-Code - -typedef struct { -\newline - void* start; -\newline - ptl_size_t length; -\newline - int threshold; -\newline - unsigned int max_offset; -\newline - unsigned int options; -\newline - void* user_ptr; -\newline - ptl_handle_eq_t eventq; -\newline -} ptl_md_t; -\layout Standard -\noindent -The -\family typewriter -ptl_md_t -\family default - type defines the application view of a memory descriptor. - Values of this type are used to initialize and update the memory descriptors. -\layout Subsubsection - -Members -\layout Description - -start,\SpecialChar ~ -length Specify the memory region associated with the memory descriptor. - The -\family typewriter -start -\family default - member specifies the starting address for the memory region and the -\family typewriter -length -\family default - member specifies the length of the region. - The -\family typewriter -start member -\family default - can be NULL provided that the -\family typewriter -length -\family default - member is zero. - (Zero length buffers are useful to record events.) There are no alignment - restrictions on the starting address or the length of the region; although, - unaligned messages may be slower (i.e., lower bandwidth and/or longer latency) - on some implementations. - -\layout Description - -threshold Specifies the maximum number of operations that can be performed - on the memory descriptor. 
- An operation is any action that could possibly generate an event (see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - for the different types of events). - In the usual case, the threshold value is decremented for each operation - on the memory descriptor. - When the threshold value is zero, the memory descriptor is -\emph on -inactive -\emph default -, and does not respond to operations. - A memory descriptor can have an initial threshold value of zero to allow - for manipulation of an inactive memory descriptor by the local process. - A threshold value of -\family typewriter -PTL_MD_THRESH_INF -\family default - indicates that there is no bound on the number of operations that may be - applied to a memory descriptor. - Note that local operations (e.g., -\emph on -PtlMDUpdate -\emph default -) are not applied to the threshold count. - -\layout Description - -max_offset Specifies the maximum local offset of a memory descriptor. - When the local offset of a memory descriptor exceeds this maximum, the - memory descriptor becomes -\shape italic -inactive -\shape default - and does not respond to further operations. -\layout Description - -options Specifies the behavior of the memory descriptor. - There are five options that can be selected: enable put operations (yes - or no), enable get operations (yes or no), offset management (local or - remote), message truncation (yes or no), and acknowledgement (yes or no). - Values for this argument can be constructed using a bitwise or of the following - values: -\begin_deeper -\begin_deeper -\layout Description - -PTL_MD_OP_PUT Specifies that the memory descriptor will respond to -\emph on -put -\emph default - operations. - By default, memory descriptors reject -\emph on -put -\emph default - operations. - -\layout Description - -PTL_MD_OP_GET Specifies that the memory descriptor will respond to -\emph on -get -\emph default - operations. 
- By default, memory descriptors reject -\emph on -get -\emph default - operations. - -\layout Description - -PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory - region is provided by the incoming request. - By default, the offset is maintained locally. - When the offset is maintained locally, the offset is incremented by the - length of the request so that the next operation (put and/or get) will - access the next part of the memory region. -\layout Description - -PTL_MD_TRUNCATE Specifies that the length provided in the incoming request - can be reduced to match the memory available in the region. - (The memory available in a memory region is determined by subtracting the - offset from the length of the memory region.) By default, if the length - in the incoming operation is greater than the amount of memory available, - the operation is rejected. - -\layout Description - -PTL_MD_ACK_DISABLE Specifies that an acknowledgement should -\emph on -not -\emph default - be sent for incoming -\emph on -put -\emph default - operations, even if requested. - By default, acknowledgements are sent for -\emph on -put -\emph default - operations that request an acknowledgement. - Acknowledgements are never sent for -\emph on -get -\emph default - operations. - The value sent in the reply serves as an implicit acknowledgement. - -\end_deeper -\layout Standard - - -\series bold -Note -\series default -: It is not considered an error to have a memory descriptor that does not - respond to either -\emph on -put -\emph default - or -\emph on -get -\emph default - operations: Every memory descriptor responds to -\emph on -reply -\emph default - operations. - Nor is it considered an error to have a memory descriptor that responds - to both -\emph on -put -\emph default - and -\emph on -get -\emph default - operations. - -\end_deeper -\layout Description - -user_ptr A user-specified value that is associated with the memory descriptor. 
- The value does not need to be a pointer, but must fit in the space used - by a pointer. - This value (along with other values) is recorded in events associated with - operations on this memory descriptor. -\begin_inset Foot -collapsed true - -\layout Standard - -Tying the memory descriptor to a user-defined value can be useful when multiple - memory descriptors share the same event queue or when the memory descriptor - needs to be associated with a data structure maintained by the application. - For example, an MPI implementation can set the -\family typewriter -user_ptr -\family default - argument to the value of an MPI Request. - This direct association allows for processing of memory descriptors by - the MPI implementation without a table lookup or a search for the appropriate - MPI Request. -\end_inset - - -\layout Description - -eventq A handle for the event queue used to log the operations performed - on the memory region. - If this argument is -\family typewriter -PTL_EQ_NONE -\family default -, operations performed on this memory descriptor are not logged. - -\layout Subsection - -PtlMDAttach -\begin_inset LatexCommand \label{sec:mdattach} - -\end_inset - - -\layout LyX-Code - -int PtlMDAttach( ptl_handle_me_t match, -\newline - ptl_md_t mem_desc, -\newline - ptl_unlink_t unlink_op, -\newline - ptl_unlink_t unlink_nofit, -\newline - ptl_handle_md_t* handle ); -\layout Standard -\noindent -Values of the type -\family typewriter -ptl_unlink_t -\family default - are used to control whether an item is unlinked from a list. - The value -\family typewriter -PTL_UNLINK -\family default - enables unlinking. - The value -\family typewriter -PTL_RETAIN -\family default - disables unlinking. -\layout Standard - -The -\emph on -PtlMDAttach -\emph default - operation is used to create a memory descriptor and attach it to a match - list entry. - An error code is returned if this match list entry already has an associated - memory descriptor.
-\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INUSE Indicates that -\family typewriter -match -\family default - already has a memory descriptor attached. - -\layout Description - -PTL_INV_ME Indicates that -\family typewriter -match -\family default - is not a valid match entry handle. - -\layout Description - -PTL_ILL_MD Indicates that -\family typewriter -mem_desc -\family default - is not a legal memory descriptor. - This may happen because the memory region defined in -\family typewriter -mem_desc -\family default - is invalid or because the network interface associated with the -\family typewriter -eventq -\family default - in -\family typewriter -mem_desc -\family default - is not the same as the network interface associated with -\family typewriter -match -\family default -. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - memory descriptor. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -match -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the match entry that the memory descriptor will be associated - with. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Provides initial values for the application visible parts of a memory descriptor. - Other than its use for initialization, there is no linkage between this - structure and the memory descriptor maintained by the API. 
- -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -unlink_op -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A flag to indicate whether the memory descriptor is unlinked when it becomes - inactive, either because the operation threshold drops to zero or because - the maximum offset has been exceeded. - (Note, the check for unlinking a memory descriptor only occurs after - the completion of a successful operation. - If the threshold is set to zero during initialization or using -\emph on -PtlMDUpdate -\emph default -, the memory descriptor is -\series bold -not -\series default - unlinked.) -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -unlink_nofit -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A flag to indicate whether the memory descriptor is unlinked when the space - remaining in the memory descriptor is not sufficient for a matching operation. - If an incoming message arrives at a memory descriptor that does - not have sufficient space and the -\series bold -PTL_MD_TRUNCATE -\series default - option is not specified, the memory descriptor will be unlinked. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the newly created - memory descriptor. - The -\family typewriter -handle -\family default - argument can be NULL, in which case the handle will not be returned.
-\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMDBind -\begin_inset LatexCommand \label{sec:mdbind} - -\end_inset - - -\layout LyX-Code - -int PtlMDBind( ptl_handle_ni_t interface, -\newline - ptl_md_t mem_desc, -\newline - ptl_handle_md_t* handle ); -\layout Standard -\noindent -The -\emph on -PtlMDBind -\emph default - operation is used to create a -\begin_inset Quotes eld -\end_inset - -free floating -\begin_inset Quotes erd -\end_inset - - memory descriptor, i.e., a memory descriptor that is not associated with - a match list entry. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_ILL_MD Indicates that -\family typewriter -mem_desc -\family default - is not a legal memory descriptor. - This may happen because the memory region defined in -\family typewriter -mem_desc -\family default - is invalid or because the network interface associated with the -\family typewriter -eventq -\family default - in -\family typewriter -mem_desc -\family default - is not the same as the network interface, -\family typewriter -interface -\family default -. - -\layout Description - -PTL_INV_EQ Indicates that the event queue associated with -\family typewriter -mem_desc -\family default - is not valid. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - memory descriptor. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -handle -\family default - is not a legal address.
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the network interface with which the memory descriptor will - be associated. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Provides initial values for the application visible parts of a memory descriptor. - Other than its use for initialization, there is no linkage between this - structure and the memory descriptor maintained by the API. - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the newly created - memory descriptor. - The -\family typewriter -handle -\family default - argument must be a valid address and cannot be NULL. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMDUnlink -\begin_inset LatexCommand \label{sec:mdfree} - -\end_inset - - -\layout LyX-Code - -int PtlMDUnlink( ptl_handle_md_t mem_desc ); -\layout Standard -\noindent -The -\emph on -PtlMDUnlink -\emph default - function unlinks the memory descriptor from any match list entry it may - be linked to and releases the resources associated with a memory descriptor. - (This function does not free the memory region associated with the memory - descriptor.) This function also releases the resources associated with a - floating memory descriptor. - Only memory descriptors with no pending operations may be unlinked. 
-\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_MD Indicates that -\family typewriter -mem_desc -\family default - is not a valid memory descriptor handle. -\layout Description - -PTL_MD_INUSE Indicates that -\family typewriter -mem_desc -\family default - has pending operations and cannot be unlinked. -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the memory descriptor to be released. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlMDUpdate -\begin_inset LatexCommand \label{sec:mdupdate} - -\end_inset - - -\layout LyX-Code - -int PtlMDUpdate( ptl_handle_md_t mem_desc, -\newline - ptl_md_t* old_md, -\newline - ptl_md_t* new_md, -\newline - ptl_handle_eq_t testq ); -\layout Standard -\noindent -The -\emph on -PtlMDUpdate -\emph default - function provides a conditional, atomic update operation for memory descriptors. - The memory descriptor identified by -\family typewriter -mem_desc -\family default - is only updated if the event queue identified by -\family typewriter -testq -\family default - is empty. - The intent is to only enable updates to the memory descriptor when no new - messages have arrived since the last time the queue was checked. - See section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:exmpi} - -\end_inset - - for an example of how this function can be used. 
-\layout Standard - -If -\family typewriter -new_md -\family default - is not NULL, the memory descriptor identified by mem_desc will be updated - to reflect the values in the structure pointed to by -\family typewriter -new_md -\family default - if -\family typewriter -testq -\family default - has the value -\family typewriter -PTL_EQ_NONE -\family default - or if the event queue identified by -\family typewriter -testq -\family default - is empty. - If -\family typewriter -old_md -\family default - is not NULL, the current value of the memory descriptor identified by -\family typewriter -mem_desc -\family default - is recorded in the location identified by -\family typewriter -old_md -\family default -. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_NOUPDATE Indicates that the update was not performed because -\family typewriter -testq -\family default - was not empty. - -\layout Description - -PTL_INV_MD Indicates that -\family typewriter -mem_desc -\family default - is not a valid memory descriptor handle. - -\layout Description - -PTL_ILL_MD Indicates that the value pointed to by -\family typewriter -new_md -\family default - is not a legal memory descriptor (e.g., the memory region specified by the - memory descriptor may be invalid). - -\layout Description - -PTL_INV_EQ Indicates that -\family typewriter -testq -\family default - is not a valid event queue handle. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -new_md -\family default - or -\family typewriter -old_md -\family default - is not a legal address.
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the memory descriptor to update. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -old_md -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -If -\family typewriter -old_md -\family default - is not the value -\family typewriter -NULL -\family default -, the current value of the memory descriptor will be stored in the location - identified by -\family typewriter -old -\family default -_md. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -new_md -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -If -\family typewriter -new_md -\family default - is not the value -\family typewriter -NULL -\family default -, this argument provides the new values for the memory descriptor, if the - update is performed. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -testq -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for an event queue used to predicate the update. - If -\family typewriter -testq -\family default - is equal to -\family typewriter -PTL_EQ_NONE -\family default -, the update is performed unconditionally. - Otherwise, the update is performed if and only if -\family typewriter -testq -\family default - is empty. - If the update is not performed, the function returns the value -\family typewriter -PTL_NOUPDATE -\family default -. 
- (Note, the -\family typewriter -testq -\family default - argument does not need to be the same as the event queue associated with - the memory descriptor.) -\end_inset - - - - -\end_inset - - -\layout Standard - -The conditional update can be used to ensure that the memory descriptor - has not changed between the time it was examined and the time it is updated. - In particular, it is needed to support an MPI implementation where the - activity of searching an unexpected message queue and posting a receive - must be atomic. -\layout Section - -Events and Event Queues -\begin_inset LatexCommand \label{sec:eq} - -\end_inset - - -\layout Standard - -Event queues are used to log operations performed on memory descriptors. - They can also be used to hold acknowledgements for completed -\emph on -put -\emph default - operations and to note when the data specified in a -\emph on -put -\emph default - operation has been sent (i.e., when it is safe to reuse the buffer that holds - this data). - Multiple memory descriptors can share a single event queue. -\layout Standard - -In addition to the -\family typewriter -ptl_handle_eq_t -\family default - type, the Portals API defines two types associated with events: The -\family typewriter - -\newline -ptl_event_kind_t -\family default - type defines the kinds of events that can be stored in an event queue. - The -\family typewriter -ptl_event_t -\family default - type defines a structure that holds the information associated with an - event. 
-\layout Standard - -The Portals API also provides four functions for dealing with event queues: - The -\emph on -PtlEQAlloc -\emph default - function is used to allocate the API resources needed for an event queue, - the -\emph on -PtlEQFree -\emph default - function is used to release these resources, the -\emph on -PtlEQGet -\emph default - function can be used to get the next event from an event queue, and the - -\emph on -PtlEQWait -\emph default - function can be used to block a process (or thread) until an event queue - has at least one event. -\layout Subsection - -Kinds of Events -\begin_inset LatexCommand \label{sec:ek-type} - -\end_inset - - -\layout LyX-Code - -typedef enum { -\newline - PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL, -\newline - PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL, -\newline - PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL, -\newline - PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL, -\newline - PTL_EVENT_ACK, -\newline - PTL_EVENT_UNLINK -\newline -} ptl_event_kind_t; -\layout Standard -\noindent -The Portals API defines fourteen types of events that can be logged in an - event queue: -\layout Description - -PTL_EVENT_GET_START A remote -\emph on -get -\emph default - operation has been started on the memory descriptor. - The memory region associated with this descriptor should not be altered - until the corresponding END or FAIL event is logged. -\layout Description - -PTL_EVENT_GET_END A previously initiated -\emph on -get -\emph default - operation completed successfully. - This event is logged after the reply has been sent by the local node. - As such, the process could free the memory descriptor once it sees this - event. - -\layout Description - -PTL_EVENT_GET_FAIL A previously initiated -\emph on -get -\emph default - operation completed unsuccessfully. - This event is logged after the reply has been sent by the local node. 
- As such, the process could free the memory descriptor once it sees this - event. - -\layout Description - -PTL_EVENT_PUT_START A remote -\emph on -put -\emph default - operation has been started on the memory descriptor. - The memory region associated with this descriptor should be considered - volatile until the corresponding END or FAIL event is logged. -\layout Description - -PTL_EVENT_PUT_END A previously initiated -\emph on -put -\emph default - operation completed successfully. - The underlying layers will not alter the memory (on behalf of this operation) - once this event has been logged. - -\layout Description - -PTL_EVENT_PUT_FAIL A previously initiated -\emph on -put -\emph default - operation completed unsuccessfully. - The underlying layers will not alter the memory (on behalf of this operation) - once this event has been logged. - -\layout Description - -PTL_EVENT_REPLY_START A -\emph on -reply -\emph default - operation has been started on the memory descriptor. - -\layout Description - -PTL_EVENT_REPLY_END A previously initiated -\emph on -reply -\emph default - operation has completed successfully. - This event is logged after the data (if any) from the reply has been written - into the memory descriptor. - -\layout Description - -PTL_EVENT_REPLY_FAIL A previously initiated -\emph on -reply -\emph default - operation has completed unsuccessfully. - This event is logged after the data (if any) from the reply has been written - into the memory descriptor. - -\layout Description - -PTL_EVENT_ACK An -\emph on -acknowledgement -\emph default - was received. - This event is logged when the acknowledgement is received. -\layout Description - -PTL_EVENT_SEND_START An outgoing -\emph on -send -\emph default - operation has been started. - The memory region associated with this descriptor should not be altered - until the corresponding END or FAIL event is logged.
-\layout Description - -PTL_EVENT_SEND_END A previously initiated -\emph on -send -\emph default - operation has completed successfully. - This event is logged after the entire buffer has been sent and it is safe - for the application to reuse the buffer. - -\layout Description - -PTL_EVENT_SEND_FAIL A previously initiated -\emph on -send -\emph default - operation has completed unsuccessfully. - The process can safely manipulate the memory or free the memory descriptor - once it sees this event. -\layout Description - -PTL_EVENT_UNLINK A memory descriptor associated with this event queue has - been automatically unlinked. - This event is not generated when a memory descriptor is explicitly unlinked - by calling -\shape italic -PtlMDUnlink -\shape default -. - This event does not decrement the threshold count. -\layout Subsection - -Event Ordering -\layout Standard - -The Portals API guarantees that when a process initiates two operations - on a remote process, the operations will be initiated on the remote process - in the same order that they were initiated on the original process. - As an example, if process A initiates two -\emph on -put -\emph default - operations, -\emph on -x -\emph default - and -\emph on -y -\emph default -, on process B, the Portals API guarantees that process A will receive the - -\family typewriter -PTL_EVENT_SEND_START -\family default - events for -\emph on -x -\emph default - and -\emph on -y -\emph default - in the same order that process B receives the -\family typewriter -PTL_EVENT_PUT_START -\family default - events for -\emph on -x -\emph default - and -\emph on -y -\emph default -. - Notice that the API does not guarantee that the start events will be delivered - in the same order that process A initiated the -\emph on -x -\emph default - and -\emph on -y -\emph default - operations.
- If process A needs to ensure the ordering of these operations, it should - include code to wait for the initiation of -\emph on -x -\emph default - before it initiates -\emph on -y -\emph default -. -\layout Subsection - -Failure Notification -\layout Standard - -Operations may fail to complete successfully; however, unless the node itself - fails, every operation that is started will eventually complete. - While an operation is in progress, the memory associated with the operation - should not be viewed (in the case of a put or a reply) or altered (in the - case of a send or get). - Operation completion, whether successful or unsuccessful, is final. - That is, when an operation completes, the memory associated with the operation - will no longer be read or altered by the operation. - A network interface can use the -\family typewriter -ptl_ni_fail_t -\family default - to define more specific information regarding the failure of the operation - and record this information in the -\family typewriter -ni_fail_type -\family default - field of the event. -\layout Subsection - -The Event Type -\begin_inset LatexCommand \label{sec:event-type} - -\end_inset - - -\layout LyX-Code - -typedef struct { -\newline - ptl_event_kind_t type; -\newline - ptl_process_id_t initiator; -\newline - ptl_uid_t uid; -\layout LyX-Code - - ptl_pt_index_t portal; -\newline - ptl_match_bits_t match_bits; -\newline - ptl_size_t rlength; -\newline - ptl_size_t mlength; -\newline - ptl_size_t offset; -\newline - ptl_handle_md_t md_handle; -\newline - ptl_md_t mem_desc; -\newline - ptl_hdr_data_t hdr_data; -\newline - ptl_seq_t link; -\newline - ptl_ni_fail_t ni_fail_type; -\newline - volatile ptl_seq_t sequence; -\newline -} ptl_event_t; -\layout Standard -\noindent -An event structure includes the following members: -\layout Description - -type Indicates the type of the event. - -\layout Description - -initiator The id of the initiator. 
- -\layout Description - -portal The Portal table index specified in the request. - -\layout Description - -match_bits A copy of the match bits specified in the request. - See section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - for more information on match bits. - -\layout Description - -rlength The length (in bytes) specified in the request. - -\layout Description - -mlength The length (in bytes) of the data that was manipulated by the operation. - For truncated operations, the manipulated length will be the number of - bytes specified by the memory descriptor (possibly with an offset) operation. - For all other operations, the manipulated length will be the length of - the requested operation. - -\layout Description - -offset Is the displacement (in bytes) into the memory region that the operation - used. - The offset can be determined by the operation (see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:datamovement} - -\end_inset - -) for a remote managed memory descriptor, or by the local memory descriptor - (see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - -). - -\layout Description - -md_handle Is the handle to the memory descriptor associated with the event. -\layout Description - -mem_desc Is the state of the memory descriptor immediately after the event - has been processed. - -\layout Description - -hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - -). - -\layout Description - -link The -\emph on -link -\emph default - member is used to link -\family typewriter -START -\family default - events with the -\family typewriter -END -\family default - or -\family typewriter -FAIL -\family default - event that signifies completion of the operation. - The -\emph on -link -\emph default - member will be the same for the two events associated with an operation. 
- The link member is also used to link an -\family typewriter -UNLINK -\family default - event with the event that caused the memory descriptor to be unlinked. -\layout Description - -sequence The sequence number for this event. - Sequence numbers are unique to each event. -\layout Comment - -The -\emph on -sequence -\emph default - member is the last member and is volatile to support SMP implementations. - When an event structure is filled in, the -\emph on -sequence -\emph default - member should be written after all other members have been updated. - Moreover, a memory barrier should be inserted between the updating of other - members and the updating of the -\emph on -sequence -\emph default - member. -\layout Subsection - -PtlEQAlloc -\begin_inset LatexCommand \label{sec:eqalloc} - -\end_inset - - -\layout LyX-Code - -int PtlEQAlloc( ptl_handle_ni_t interface, -\newline - ptl_size_t count, -\newline - ptl_handle_eq_t* handle ); -\layout Standard -\noindent -The -\emph on -PtlEQAlloc -\emph default - function is used to build an event queue. - -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_NOSPACE Indicates that there is insufficient memory to allocate the - event queue. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -handle -\family default - is not a legal address. 
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the interface with which the event queue will be associated. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -count -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The number of events that can be stored in the event queue. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold a handle for the newly created - event queue. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlEQFree -\begin_inset LatexCommand \label{sec:eqfree} - -\end_inset - - -\layout LyX-Code - -int PtlEQFree( ptl_handle_eq_t eventq ); -\layout Standard -\noindent -The -\emph on -PtlEQFree -\emph default - function releases the resources associated with an event queue. - It is up to the user to insure that no memory descriptors are associated - with the event queue once it is freed. - -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_EQ Indicates that -\family typewriter -eventq -\family default - is not a valid event queue handle. 
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -eventq -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard - -A handle for the event queue to be released. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlEQGet -\begin_inset LatexCommand \label{sec:eqget} - -\end_inset - - -\layout LyX-Code - -int PtlEQGet( ptl_handle_eq_t eventq, -\newline - ptl_event_t* event ); -\layout Standard -\noindent -The -\emph on -PtlEQGet -\emph default - function is a nonblocking function that can be used to get the next event - in an event queue. - The event is removed from the queue. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at - least one event between this event and the last event obtained (using -\emph on -PtlEQGet -\emph default - or -\emph on -PtlEQWait -\emph default -) from this event queue has been dropped due to limited space in the event - queue. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_EQ_EMPTY Indicates that -\family typewriter -eventq -\family default - is empty or another thread is waiting on -\emph on -PtlEQWait -\emph default -. - -\layout Description - -PTL_INV_EQ Indicates that -\family typewriter -eventq -\family default - is not a valid event queue handle. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -event -\family default - is not a legal address.
- -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -eventq -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the event queue. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -event -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the values associated with - the next event in the event queue. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlEQWait -\begin_inset LatexCommand \label{sec:eqwait} - -\end_inset - - -\layout LyX-Code - -int PtlEQWait( ptl_handle_eq_t eventq, -\newline - ptl_event_t* event ); -\layout Standard -\noindent -The -\emph on -PtlEQWait -\emph default - function can be used to block the calling process (thread) until there - is an event in an event queue. - This function also returns the next event in the event queue and removes - this event from the queue. - This is the only blocking operation in the Portals 3.2 API. - In the event that multiple threads are waiting on the same event queue, - PtlEQWait is guaranteed to wake exactly one thread, but the order in which - they are awakened is not specified. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at - least one event between this event and the last event obtained (using -\emph on -PtlEQGet -\emph default - or -\emph on -PtlEQWait -\emph default -) from this event queue has been dropped due to limited space in the event - queue. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
- -\layout Description - -PTL_INV_EQ Indicates that -\family typewriter -eventq -\family default - is not a valid event queue handle. - -\layout Description - -PTL_SEGV Indicates that -\family typewriter -event -\family default - is not a legal address. - -\layout Subsubsection - -Arguments -\layout Standard -\noindent - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -eventq -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the event queue to wait on. - The calling process (thread) will be blocked until -\family typewriter -eventq -\family default - is not empty. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -event -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -output -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -On successful return, this location will hold the values associated with - the next event in the event queue. -\end_inset - - - - -\end_inset - - -\layout Section - -The Access Control Table -\begin_inset LatexCommand \label{sec:ac} - -\end_inset - - -\layout Standard - -Processes can use the access control table to control which processes are - allowed to perform operations on Portal table entries. - Each communication interface has a Portal table and an access control table. - The access control table for the default interface contains an entry at - index zero that allows all processes with the same user id to communicate. - Entries in the access control table can be manipulated using the -\emph on -PtlACEntry -\emph default - function.
-\layout Subsection - -PtlACEntry -\begin_inset LatexCommand \label{sec:acentry} - -\end_inset - - -\layout LyX-Code - -int PtlACEntry( ptl_handle_ni_t interface, -\newline - ptl_ac_index_t index, -\newline - ptl_process_id_t matchid, -\newline - ptl_uid_t user_id, -\newline - ptl_pt_index_t portal ); -\layout Standard -\noindent -The -\emph on -PtlACEntry -\emph default - function can be used to update an entry in the access control table for - an interface. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_NI Indicates that -\family typewriter -interface -\family default - is not a valid network interface handle. - -\layout Description - -PTL_AC_INV_INDEX Indicates that -\family typewriter -index -\family default - is not a valid access control table index. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -matchid -\family default - is not a valid process identifier. - -\layout Description - -PTL_PT_INV_INDEX Indicates that -\family typewriter -portal -\family default - is not a valid Portal table index. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the interface to use. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The index of the entry in the access control table to update. 
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -matchid -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the process(es) that are allowed to perform operations. - The constants -\family typewriter -PTL_PID_ANY -\family default - and -\family typewriter -PTL_NID_ANY -\family default - can be used to wildcard either of the ids in the -\family typewriter -ptl_process_id_t -\family default - structure. - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -user_id -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the user that is allowed to perform operations. - The value -\family typewriter -PTL_UID_ANY -\family default - can be used to wildcard the user. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -portal -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Identifies the Portal index(es) that can be used. - The value -\family typewriter -PTL_PT_INDEX_ANY -\family default - can be used to wildcard the Portal index. -\end_inset - - - - -\end_inset - - -\layout Section - -Data Movement Operations -\begin_inset LatexCommand \label{sec:datamovement} - -\end_inset - - -\layout Standard - -The Portals API provides two data movement operations: -\emph on -PtlPut -\emph default - and -\emph on -PtlGet -\emph default -. 
-\layout Subsection - -PtlPut -\begin_inset LatexCommand \label{sec:put} - -\end_inset - - -\layout LyX-Code - -typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t; -\newline - -\newline -int PtlPut( ptl_handle_md_t mem_desc, -\newline - ptl_ack_req_t ack_req, -\newline - ptl_process_id_t target, -\newline - ptl_pt_index_t portal, -\newline - ptl_ac_index_t cookie, -\newline - ptl_match_bits_t match_bits, -\newline - ptl_size_t offset, -\newline - ptl_hdr_data_t hdr_data ); -\layout Standard -\noindent -Values of the type -\family typewriter -ptl_ack_req_t -\family default - are used to control whether an acknowledgement should be sent when the - operation completes (i.e., when the data has been written to a memory descriptor - of the -\family typewriter -target -\family default - process). - The value -\family typewriter -PTL_ACK_REQ -\family default - requests an acknowledgement, the value -\family typewriter -PTL_NOACK_REQ -\family default - requests that no acknowledgement should be generated. -\layout Standard - -The -\emph on -PtlPut -\emph default - function initiates an asynchronous put operation. - There are several events associated with a put operation: initiation of - the send on the local node ( -\family typewriter -PTL_EVENT_SEND_START -\family default -), completion of the send on the local node ( -\family typewriter -PTL_EVENT_SEND_END -\family default - or -\family typewriter -PTL_EVENT_SEND_FAIL -\family default -), and, when the send completes successfully, the receipt of an acknowledgement - ( -\family typewriter -PTL_EVENT_ACK -\family default -) indicating that the operation was accepted by the target. - These events will be logged in the event queue associated with the memory - descriptor ( -\family typewriter -mem_desc -\family default -) used in the put operation. - Using a memory descriptor that does not have an associated event queue - results in these events being discarded. 
- In this case, the application must have another mechanism (e.g., a higher - level protocol) for determining when it is safe to modify the memory region - associated with the memory descriptor. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_MD Indicates that -\family typewriter -mem_desc -\family default - is not a valid memory descriptor. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -target -\family default - is not a valid process id. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the memory descriptor that describes the memory to be sent. - If the memory descriptor has an event queue associated with it, it will - be used to record events when the message has been sent (PTL_EVENT_SEND_START, - PTL_EVENT_SEND_END). - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ack_req -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -Controls whether an acknowledgement event is requested. - Acknowledgements are only sent when they are requested by the initiating - process -\series bold -and -\series default - the memory descriptor has an event queue -\series bold -and -\series default - the target memory descriptor enables them. - Allowed constants: -\family typewriter -PTL_ACK_REQ -\family default -, -\family typewriter -PTL_NOACK_REQ -\family default -. 
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -target -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A process id for the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -portal -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The index in the remote Portal table. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -cookie -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The index into the access control table of the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -match_bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The match bits to use for message selection at the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The offset into the target memory descriptor (only used when the target - memory descriptor has the -\family typewriter -PTL_MD_MANAGE_REMOTE -\family default - option set). -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -hdr_data -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -64 bits of user data that can be included in message header. 
- This data is written to an event queue entry at the target if an event - queue is present on the matching memory descriptor. -\end_inset - - - - -\end_inset - - -\layout Subsection - -PtlGet -\begin_inset LatexCommand \label{sec:get} - -\end_inset - - -\layout LyX-Code - -int PtlGet( ptl_handle_md_t mem_desc, -\newline - ptl_process_id_t target, -\newline - ptl_pt_index_t portal, -\newline - ptl_ac_index_t cookie, -\newline - ptl_match_bits_t match_bits, -\newline - ptl_size_t offset ); -\layout Standard -\noindent -The -\emph on -PtlGet -\emph default - function initiates a remote read operation. - There are two event pairs associated with a get operation: when the data - is sent from the remote node, a -\family typewriter -PTL_EVENT_GET_{START|END} -\family default - event pair is registered on the remote node; and when the data is returned - from the remote node, a -\family typewriter -PTL_EVENT_REPLY_{START|END} -\family default - event pair is registered on the local node. -\layout Subsubsection - -Return Codes -\layout Description - -PTL_OK Indicates success. - -\layout Description - -PTL_NOINIT Indicates that the Portals API has not been successfully initialized. - -\layout Description - -PTL_INV_MD Indicates that -\family typewriter -mem_desc -\family default - is not a valid memory descriptor. - -\layout Description - -PTL_INV_PROC Indicates that -\family typewriter -target -\family default - is not a valid process id. - -\layout Subsubsection - -Arguments -\layout Standard - - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A handle for the memory descriptor that describes the memory into which - the requested data will be received.
- The memory descriptor can have an event queue associated with it to record - events, such as when the message receive has started ( -\family typewriter -PTL_EVENT_REPLY -\family default -_ -\family typewriter -START -\family default -). -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -target -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -A process id for the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -portal -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The index in the remote Portal table. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -cookie -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The index into the access control table of the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -match_bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The match bits to use for message selection at the target process. -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -input -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -The offset into the target memory descriptor (only used when the target - memory descriptor has the -\family typewriter -PTL_MD_MANAGE_REMOTE -\family default - option set). 
-\end_inset - - - - -\end_inset - - -\layout Section - -Summary -\layout Standard - - -\begin_inset LatexCommand \label{sec:summary} - -\end_inset - - We conclude this section by summarizing the names introduced by the Portals - 3.2 API. - We start by summarizing the names of the types introduced by the API. - This is followed by a summary of the functions introduced by the API, - which is followed in turn by a summary of the function return codes. - Finally, we conclude with a summary of the other constant values introduced - by the API. -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:types} - -\end_inset - - presents a summary of the types defined by the Portals API. - The first column in this table gives the type name, the second column gives - a brief description of the type, the third column identifies the section - where the type is defined, and the fourth column lists the functions that - have arguments of this type. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Types Defined by the Portals 3.2 API -\begin_inset LatexCommand \label{tab:types} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\noindent - -\size small - -\begin_inset Tabular - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold - Name -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold - Meaning -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold - Sect -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold - Functions -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ac_index_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -indexes for an access control table -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:index-type} -
-\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlACEntry, PtlPut, PtlGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ack_req_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -acknowledgement request types -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlPut -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -kinds of events -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -information about events -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:event-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlEQGet, PtlEQWait -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_seq_t -\end_inset - - -\begin_inset Text - -\layout Standard - -event sequence number -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:event-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlEQGet, PtlEQWait -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_any_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -handles for any object -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - -
-\begin_inset Text - -\layout Standard -\noindent -PtlNIHandle -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_eq_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -handles for event queues -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_md_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -handles for memory descriptors -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert, - PtlPut, PtlGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_me_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -handles for match entries -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_ni_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -handles for network interfaces -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut, - PtlGet -\end_inset - - - - -\begin_inset Text - -\layout 
Standard - - -\family typewriter -ptl_nid_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -node identifiers -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlGetId, PtlACEntry -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pid_t -\end_inset - - -\begin_inset Text - -\layout Standard - -process identifier -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlGetId, PtlACEntry -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_uid_t -\end_inset - - -\begin_inset Text - -\layout Standard - -user identifier -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlGetUid, PtlACEntry -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ins_pos_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -insertion position (before or after) -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMEAttach, PtlMEAttachAny, PtlMEInsert -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_interface_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -identifiers for network interfaces -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit -\end_inset - - - - -\begin_inset Text - -\layout Standard - -
-\family typewriter -ptl_match_bits_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -match (and ignore) bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:mb-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_md_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -memory descriptors -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDAttach, PtlMDUpdate -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ni_fail_t -\end_inset - - -\begin_inset Text - -\layout Standard - -network interface-specific failures -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlEQGet, PtlEQWait -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -process identifiers -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:pid-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -indexes for Portal tables -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:index-type} - -\end_inset - - -\end_inset - 
- -\begin_inset Text - -\layout Standard -\noindent -PtlMEAttach, PtlMEAttachAny, PtlACEntry -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -sizes -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:size-t} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlEQAlloc, PtlPut, PtlGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_sr_index_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -indexes for status registers -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:stat-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIStatus -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_sr_value_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -values in status registers -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:stat-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIStatus -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_unlink_t -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -unlink options -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:func} - -\end_inset - - presents a summary of the functions defined by the Portals API. 
- The first column in this table gives the name for the function, the second - column gives a brief description of the operation implemented by the function, - and the third column identifies the section where the function is defined. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Functions Defined by the Portals 3.2 API -\begin_inset LatexCommand \label{tab:func} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - -Name -\end_inset - - -\begin_inset Text - -\layout Standard - - Operation -\end_inset - - -\begin_inset Text - -\layout Standard - - Section -\end_inset - - - - -\begin_inset Text - -\layout Standard - -PtlACEntry -\end_inset - - -\begin_inset Text - -\layout Standard - - update an entry in an access control table -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ac} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlEQAlloc -\end_inset - - -\begin_inset Text - -\layout Standard - - create an event queue -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlEQGet -\end_inset - - -\begin_inset Text - -\layout Standard - - get the next event from an event queue -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlEQFree -\end_inset - - -\begin_inset Text - -\layout Standard - - release the resources for an event queue -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - - - 
-\begin_inset Text - -\layout Standard - - PtlEQWait -\end_inset - - -\begin_inset Text - -\layout Standard - - wait for a new event in an event queue -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:eq} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlFini -\end_inset - - -\begin_inset Text - -\layout Standard - - shutdown the Portals API -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:init} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlGet -\end_inset - - -\begin_inset Text - -\layout Standard - - perform a get operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:datamovement} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlGetId -\end_inset - - -\begin_inset Text - -\layout Standard - - get the id for the current process -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:pid} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlInit -\end_inset - - -\begin_inset Text - -\layout Standard - - initialize the Portals API -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:init} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMDAttach -\end_inset - - -\begin_inset Text - -\layout Standard - - create a memory descriptor and attach it to a match entry -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMDBind -\end_inset - - -\begin_inset Text - -\layout Standard - - create a free-floating memory descriptor -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:mdbind} - -\end_inset - - -\end_inset - 
- - - -\begin_inset Text - -\layout Standard - - PtlMDUnlink -\end_inset - - -\begin_inset Text - -\layout Standard - - remove a memory descriptor from a list and release its resources -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMDUpdate -\end_inset - - -\begin_inset Text - -\layout Standard - - update a memory descriptor -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMEAttach -\end_inset - - -\begin_inset Text - -\layout Standard - -create a match entry and attach it to a Portal table -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -PtlMEAttachAny -\end_inset - - -\begin_inset Text - -\layout Standard - -create a match entry and attach it to a free Portal table entry -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:attachany} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMEInsert -\end_inset - - -\begin_inset Text - -\layout Standard - - create a match entry and insert it in a list -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlMEUnlink -\end_inset - - -\begin_inset Text - -\layout Standard - - remove a match entry from a list and release its resources -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:me} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlNIDist -\end_inset - - -\begin_inset Text - -\layout Standard - - get the distance to another process -\end_inset - - 
-\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlNIFini -\end_inset - - -\begin_inset Text - -\layout Standard - - shutdown a network interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlNIHandle -\end_inset - - -\begin_inset Text - -\layout Standard - - get the network interface handle for an object -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlNIInit -\end_inset - - -\begin_inset Text - -\layout Standard - - initialize a network interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlNIStatus -\end_inset - - -\begin_inset Text - -\layout Standard - - read a network interface status register -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - PtlPut -\end_inset - - -\begin_inset Text - -\layout Standard - - perform a put operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:datamovement} - -\end_inset - - -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:retcodes} - -\end_inset - - summarizes the return codes used by functions defined by the Portals API. - All of these constants are integer values. 
- The first column of this table gives the symbolic name for the constant, - the second column gives a brief description of the value, and the third - column identifies the functions that can return this value. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Function Return Codes for the Portals 3.2 API -\begin_inset LatexCommand \label{tab:retcodes} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Name -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Meaning -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Functions -\series default - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_AC_INV_INDEX -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid access control table index -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlACEntry -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EQ_DROPPED -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -at least one event has been dropped -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlEQGet, PtlEQWait -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EQ_EMPTY -\end_inset - - -\begin_inset Text - -\layout Standard - -no events available in an event queue -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlEQGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -error during initialization or cleanup -\end_inset - - -\begin_inset Text - -\layout Standard 
-\noindent - PtlInit, PtlFini -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_ILL_MD -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -illegal memory descriptor values -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDAttach, PtlMDBind, PtlMDUpdate -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INIT_DUP -\end_inset - - -\begin_inset Text - -\layout Standard - -duplicate initialization of an interface -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INIT_INV -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -initialization of an invalid interface -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INUSE -\end_inset - - -\begin_inset Text - -\layout Standard - -the ME already has an MD -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDAttach -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_ASIZE -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid access control table size -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_EQ -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid event queue handle -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDUpdate, PtlEQFree, PtlEQGet -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_HANDLE -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid handle -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIHandle 
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_MD -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid memory descriptor handle -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDUnlink, PtlMDUpdate -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_ME -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid match entry handle -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlMDAttach -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_NI -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid network interface handle -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_PROC -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid process identifier -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_PTINDEX -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid Portal table index -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlMEAttach -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_REG -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid status register -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlNIStatus -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INV_SR_INDX -\end_inset - - -\begin_inset Text - -\layout Standard - -invalid status register index -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlNIStatus 
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_ML_TOOLONG -\end_inset - - -\begin_inset Text - -\layout Standard - -match list too long -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlMEAttach, PtlMEInsert -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_INUSE -\end_inset - - -\begin_inset Text - -\layout Standard - -MD has pending operations -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlMDUnlink -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_NOINIT -\end_inset - - -\begin_inset Text - -\layout Standard - -uninitialized API -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - -\emph on -all -\emph default -, except PtlInit -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_NOSPACE -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -insufficient memory -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_NOUPDATE -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - no update was performed -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - PtlMDUpdate -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_PT_FULL -\end_inset - - -\begin_inset Text - -\layout Standard - -Portal table is full -\end_inset - - -\begin_inset Text - -\layout Standard - -PtlMEAttachAny -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_OK -\end_inset - - -\begin_inset Text - -\layout Standard - - success -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent - -\emph on -all -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter 
-PTL_SEGV -\end_inset - - -\begin_inset Text - -\layout Standard - -addressing violation -\end_inset - - -\begin_inset Text - -\layout Standard -\noindent -PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate, - PtlEQAlloc, PtlEQGet, PtlEQWait -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:oconsts} - -\end_inset - - summarizes the remaining constant values introduced by the Portals API. - The first column in this table presents the symbolic name for the constant, - the second column gives a brief description of the value, the third column - identifies the type for the value, the fourth column identifies the section - in which the value is introduced, and the fifth column identifies the - other sections in which the value is mentioned. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Other Constants Defined by the Portals 3.2 API -\begin_inset LatexCommand \label{tab:oconsts} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Name -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Meaning -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Base type -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Intr. -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Ref. 
-\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_ACK_REQ -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -request an acknowledgement -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ack_req_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EQ_NONE -\end_inset - - -\begin_inset Text - -\layout Standard - -a NULL event queue handle -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_eq_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:handle-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md} - -\end_inset - -, -\begin_inset LatexCommand \ref{sec:mdupdate} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_GET_START -\end_inset - - -\begin_inset Text - -\layout Standard - -get event start -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_GET_END -\end_inset - - -\begin_inset Text - -\layout Standard - -get event end -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - 
-\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_GET_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -get event fail -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_PUT_START -\end_inset - - -\begin_inset Text - -\layout Standard - -put event start -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_PUT_END -\end_inset - - -\begin_inset Text - -\layout Standard - -put event end -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_PUT_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -put event fail -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - 
-\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_REPLY_START -\end_inset - - -\begin_inset Text - -\layout Standard - -reply event start -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_REPLY_END -\end_inset - - -\begin_inset Text - -\layout Standard - -reply event end -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_REPLY_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -reply event fail -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_ACK_START -\end_inset - - -\begin_inset Text - -\layout Standard - -acknowledgement event start -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - 
-\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_ACK_END -\end_inset - - -\begin_inset Text - -\layout Standard - -acknowledgement event end -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_ACK_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -acknowledgement event fail -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_SEND_START -\end_inset - - -\begin_inset Text - -\layout Standard - -send event start -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_SEND_END -\end_inset - - -\begin_inset Text - -\layout Standard - -send event end -\end_inset - - -\begin_inset Text 
- -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_SEND_FAIL -\end_inset - - -\begin_inset Text - -\layout Standard - -send event fail -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_EVENT_UNLINK -\end_inset - - -\begin_inset Text - -\layout Standard - -unlink event -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_event_kind_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ek-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_PID_ANY -\end_inset - - -\begin_inset Text - -\layout Standard - -wildcard for process id fields -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pid_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -, -\begin_inset LatexCommand \ref{sec:acentry} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter 
-PTL_NID_ANY -\end_inset - - -\begin_inset Text - -\layout Standard - -wildcard for node id fields -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_nid_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -, -\begin_inset LatexCommand \ref{sec:acentry} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_UID_ANY -\end_inset - - -\begin_inset Text - -\layout Standard - -wildcard for user id -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_uid_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:id-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meattach} - -\end_inset - -, -\begin_inset LatexCommand \ref{sec:acentry} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_IFACE_DEFAULT -\end_inset - - -\begin_inset Text - -\layout Standard - -default interface -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_interface_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:ni-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INS_AFTER -\end_inset - - -\begin_inset Text - -\layout Standard - -insert after -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ins_pos_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meinsert} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset 
- - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_INS_BEFORE -\end_inset - - -\begin_inset Text - -\layout Standard - -insert before -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ins_pos_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:meinsert} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_ACK_DISABLE -\end_inset - - -\begin_inset Text - -\layout Standard - -a flag to disable acknowledgements -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_MANAGE_REMOTE -\end_inset - - -\begin_inset Text - -\layout Standard - -a flag to enable the use of remote offsets -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - -, -\begin_inset LatexCommand \ref{sec:get} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_OP_GET -\end_inset - - -\begin_inset Text - -\layout Standard - -a flag to enable get operations -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - 
-\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_OP_PUT -\end_inset - - -\begin_inset Text - -\layout Standard - -a flag to enable put operations -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_THRESH_INF -\end_inset - - -\begin_inset Text - -\layout Standard - -infinite threshold for a memory descriptor -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_MD_TRUNCATE -\end_inset - - -\begin_inset Text - -\layout Standard - -a flag to enable truncation of a request -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:md-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_NOACK_REQ -\end_inset - - -\begin_inset Text - -\layout Standard - -request no acknowledgement -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ack_req_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:put} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_PT_INDEX_ANY -\end_inset - - -\begin_inset Text - -\layout Standard - -wildcard 
for Portal indexes -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:acentry} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_RETAIN -\end_inset - - -\begin_inset Text - -\layout Standard - -disable unlinking -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_unlink_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:mdattach} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_SR_DROP_COUNT -\end_inset - - -\begin_inset Text - -\layout Standard - -index for the dropped count register -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_sr_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:stat-type} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:nistatus} - -\end_inset - - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - -\family typewriter -PTL_UNLINK -\end_inset - - -\begin_inset Text - -\layout Standard - -enable unlinking -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_unlink_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\begin_inset LatexCommand \ref{sec:mdattach} - -\end_inset - - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Chapter - -The Semantics of Message Transmission -\begin_inset LatexCommand \label{sec:semantics} - -\end_inset - - -\layout Standard - -The Portals API uses four types of messages: put requests, 
acknowledgements, - get requests, and replies. - In this chapter, we describe the information passed on the wire for each - type of message. - We also describe how this information is used to process incoming messages. -\layout Section - -Sending Messages -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:put-wire} - -\end_inset - - summarizes the information that is transmitted for a put request. - The first column provides a descriptive name for the information, the second - column provides the type for this information, the third column identifies - the source of the information, and the fourth column provides additional - notes. - Most information that is transmitted is obtained directly from the -\emph on -PtlPut -\emph default - operation. - Notice that the handle for the memory descriptor used in the -\emph on -PtlPut -\emph default - operation is transmitted even though this value cannot be interpreted by - the target. - A value of anything other than -\family typewriter -PTL_MD_NONE -\family default - is interpreted as a request for an acknowledgement. 
-\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Information Passed in a Put Request -\begin_inset LatexCommand \label{tab:put-wire} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Information -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Type -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -\emph on -PtlPut -\emph default - arg -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Notes -\end_inset - - - - -\begin_inset Text - -\layout Standard - -operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -indicates a put request -\end_inset - - - - -\begin_inset Text - -\layout Standard - -initiator -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -local information -\end_inset - - - - -\begin_inset Text - -\layout Standard - -user -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_uid_t -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -local information -\end_inset - - - - -\begin_inset Text - -\layout Standard - -target -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -target -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - 
-\begin_inset Text - -\layout Standard - -portal index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -portal -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -cookie -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ac_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -cookie -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -match bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_match_bits_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -match_bits -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -offset -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_md_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -no ack if -\family typewriter -PTL_MD_NONE -\end_inset - - - - -\begin_inset Text - -\layout Standard - -length -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -length 
-\family default - member -\end_inset - - - - -\begin_inset Text - -\layout Standard - -data -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family roman -\emph on -bytes -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -start -\family default - and -\family typewriter -length -\family default - members -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:ack-wire} - -\end_inset - - summarizes the information transmitted in an acknowledgement. - Most of the information is simply echoed from the put request. - Notice that the initiator and target are obtained directly from the put - request, but are swapped in generating the acknowledgement. - The only new piece of information in the acknowledgement is the manipulated - length which is determined as the put request is satisfied. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Information Passed in an Acknowledgement -\begin_inset LatexCommand \label{tab:ack-wire} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Information -\series default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Type -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Put Information -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Notes -\end_inset - - - - -\begin_inset Text - -\layout Standard - -operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - 
-\begin_inset Text - -\layout Standard - - indicates an acknowledgement -\end_inset - - - - -\begin_inset Text - -\layout Standard - - initiator -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - - target -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - target -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - - initiator -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - - portal index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - - portal index -\end_inset - - -\begin_inset Text - -\layout Standard - - echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - - match bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_match_bits_t -\end_inset - - -\begin_inset Text - -\layout Standard - - match bits -\end_inset - - -\begin_inset Text - -\layout Standard - - echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - - offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\end_inset - - -\begin_inset Text - -\layout Standard - - offset -\end_inset - - -\begin_inset Text - -\layout Standard - - echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - - memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter - ptl_handle_md_t -\end_inset - - -\begin_inset Text - -\layout Standard - - memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - - echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - - requested length -\end_inset - - -\begin_inset Text - -\layout 
Standard - - -\family typewriter - ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - - length -\end_inset - - -\begin_inset Text - -\layout Standard - - echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - - manipulated length -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter - ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - - obtained from the operation -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:get-wire} - -\end_inset - - summarizes the information that is transmitted for a get request. - Like the information transmitted in a put request, most of the information - transmitted in a get request is obtained directly from the -\emph on -PtlGet -\emph default - operation. - Unlike put requests, get requests do not include the event queue handle. - In this case, the reply is generated whenever the operation succeeds and - the memory descriptor must not be unlinked until the reply is received. - As such, there is no advantage to explicitly sending the event queue handle. 
-\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Information Passed in a Get Request -\begin_inset LatexCommand \label{tab:get-wire} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Information -\series default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Type -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -\emph on -PtlGet -\emph default - argument -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Notes -\end_inset - - - - -\begin_inset Text - -\layout Standard - -operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -indicates a get operation -\end_inset - - - - -\begin_inset Text - -\layout Standard - -initiator -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -local information -\end_inset - - - - -\begin_inset Text - -\layout Standard - -user -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_uid_t -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -local information -\end_inset - - - - -\begin_inset Text - -\layout Standard - -target -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -target -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - 
- - - -\begin_inset Text - -\layout Standard - -portal index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -portal -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -cookie -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_ac_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -cookie -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -match bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_match_bits_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -match_bits -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -offset -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_md_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -length -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -mem_desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -length -\family default - member -\end_inset - - - - -\end_inset - 
- -\end_inset - - -\layout Standard - -Table\SpecialChar ~ - -\begin_inset LatexCommand \ref{tab:reply-wire} - -\end_inset - - summarizes the information transmitted in a reply. - Like an acknowledgement, most of the information is simply echoed from - the get request. - The initiator and target are obtained directly from the get request, but - are swapped in generating the reply. - The only new information in the reply is the manipulated length - and the data, which are determined as the get request is satisfied. -\layout Standard - - -\begin_inset Float table -placement htbp -wide false -collapsed false - -\layout Caption - -Information Passed in a Reply -\begin_inset LatexCommand \label{tab:reply-wire} - -\end_inset - - -\layout Standard - - -\begin_inset ERT -status Collapsed - -\layout Standard - -\backslash -medskip -\end_inset - - -\layout Standard -\align center - -\size small - -\begin_inset Tabular - - - - - - - - -\begin_inset Text - -\layout Standard - - -\series bold -Information -\series default - -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Type -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Get Information -\end_inset - - -\begin_inset Text - -\layout Standard - - -\series bold -Notes -\end_inset - - - - -\begin_inset Text - -\layout Standard - -operation -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -int -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -indicates a reply -\end_inset - - - - -\begin_inset Text - -\layout Standard - -initiator -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - -target -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -target -\end_inset - - 
-\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_process_id_t -\end_inset - - -\begin_inset Text - -\layout Standard - -initiator -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - - - -\begin_inset Text - -\layout Standard - -portal index -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_pt_index_t -\end_inset - - -\begin_inset Text - -\layout Standard - -portal index -\end_inset - - -\begin_inset Text - -\layout Standard - -echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - -match bits -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_match_bits_t -\end_inset - - -\begin_inset Text - -\layout Standard - -match bits -\end_inset - - -\begin_inset Text - -\layout Standard - -echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - -offset -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -offset -\end_inset - - -\begin_inset Text - -\layout Standard - -echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - -memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_handle_md_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -memory desc -\end_inset - - -\begin_inset Text - -\layout Standard - -echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - -requested length -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout Standard - -length -\end_inset - - -\begin_inset Text - -\layout Standard - -echo -\end_inset - - - - -\begin_inset Text - -\layout Standard - -manipulated length -\end_inset - - -\begin_inset Text - -\layout Standard - - -\family typewriter -ptl_size_t -\family default - -\end_inset - - -\begin_inset Text - -\layout 
Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -obtained from the operation -\end_inset - - - - -\begin_inset Text - -\layout Standard - -data -\end_inset - - -\begin_inset Text - -\layout Standard - - -\emph on -bytes -\end_inset - - -\begin_inset Text - -\layout Standard - -\end_inset - - -\begin_inset Text - -\layout Standard - -obtained from the operation -\end_inset - - - - -\end_inset - - -\end_inset - - -\layout Section - -Receiving Messages -\begin_inset LatexCommand \label{sec:receiving} - -\end_inset - - -\layout Standard - -When an incoming message arrives on a network interface, the communication - system first checks that the target process identified in the request is - a valid process that has initialized the network interface (i.e., that the - target process has a valid Portal table). - If this test fails, the communication system discards the message and increment -s the dropped message count for the interface. - The remainder of the processing depends on the type of the incoming message. - Put and get messages are subject to access control checks and translation - (searching a match list), while acknowledgement and reply messages bypass - the access control checks and the translation step. -\layout Standard - -Acknowledgement messages include a handle for the memory descriptor used - in the original -\emph on -PtlPut -\emph default - operation. - This memory descriptor will identify the event queue where the event should - be recorded. - Upon receipt of an acknowledgement, the runtime system only needs to confirm - that the memory descriptor and event queue still exist and that there is - space for another event. - Should any of these conditions fail, the message is simply discarded - and the dropped message count for the interface is incremented. - Otherwise, the system builds an acknowledgement event from the information - in the acknowledgement message and adds it to the event queue. 
-\layout Standard - -Reception of reply messages is also relatively straightforward. - Each reply message includes a handle for a memory descriptor. - If this descriptor exists, it is used to receive the message. - A reply message will be dropped if the memory descriptor identified in - the request doesn't exist. - In this case, the dropped message count for the interface is - incremented. - This is the only reason for dropping reply messages. - Every memory descriptor accepts and truncates incoming reply messages, - eliminating the other potential reasons for rejecting a reply message. -\layout Standard - -The critical step in processing an incoming put or get request involves - mapping the request to a memory descriptor. - This step starts by using the Portal index in the incoming request to identify - a list of match entries. - This list of match entries is searched in order until a match entry is - found whose match criteria match the match bits in the incoming request - and whose memory descriptor accepts the request. -\layout Standard - -Because acknowledgement and reply messages are generated in response to requests - made by the process receiving these messages, the checks performed by the - runtime system for acknowledgements and replies are minimal. - In contrast, put and get messages are generated by remote processes and - the checks performed for these messages are more extensive. 
- Incoming put or get messages may be rejected because: -\layout Itemize - -the Portal index supplied in the request is not valid; -\layout Itemize - -the cookie supplied in the request is not a valid access control entry; - -\layout Itemize - -the access control entry identified by the cookie does not match the identifier - of the requesting process; -\layout Itemize - -the access control entry identified by the cookie does not - match the Portal index supplied in the request; or -\layout Itemize - -the match bits supplied in the request do not match any of the match entries - with a memory descriptor that accepts the request. - -\layout Standard - -In all cases, if the message is rejected, the incoming message is discarded - and the dropped message count for the interface is incremented. -\layout Standard - -A memory descriptor may reject an incoming request for any of the following - reasons: -\layout Itemize - -the -\family typewriter -PTL_MD_OP_PUT -\family default - or -\family typewriter -PTL_MD_OP_GET -\family default - option has not been enabled and the operation is put or get, respectively; - -\layout Itemize - -the length specified in the request is too long for the memory descriptor - and the -\family typewriter -PTL_MD_TRUNCATE -\family default - option has not been enabled. -\layout Chapter - -Examples -\begin_inset LatexCommand \label{sec:examples} - -\end_inset - - -\layout Comment - -The examples presented in this chapter have not been updated to reflect - the current API. -\layout Standard - -In this section we present several examples to illustrate expected usage - patterns for the Portals 3.2 API. - The first example describes how to implement parallel servers using the - features of the Portals 3.2 API. - This example covers the access control list and the use of remote managed - offsets. - The second example presents an approach to dealing with dropped requests. - This example covers aspects of match lists and memory descriptors. 
- The final example covers message reception in MPI. - This example illustrates more sophisticated uses of matching and a procedure - to update a memory descriptor. -\layout Section - -Parallel File Servers -\begin_inset LatexCommand \label{sec:expfs} - -\end_inset - - -\layout Standard - -Figure\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:file} - -\end_inset - - illustrates the logical structure of a parallel file server. - In this case, the parallel server consists of four servers that stripe - application data across four disks. - We would like to present applications with the illusion that the file server - is a single entity. - We will assume that all of the processes that constitute the parallel server - have the same user id. -\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename file.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 196pt - lyxheight 147pt -\end_inset - - -\layout Caption - -Parallel File Server -\begin_inset LatexCommand \label{fig:file} - -\end_inset - - -\end_inset - - -\layout Standard - -When an application establishes a connection to the parallel file server, - it will allocate a Portal and access control list entry for communicating - with the server. - The access control list entry will include the Portal and match any process - in the parallel file server, so all of the file server processes will - have access to the portal. - The Portal information and access control entry will be sent to the file - server at this time. - If the application and server need to have multiple, concurrent I/O operations, - they can use additional portals or match entries to keep the operations - from interfering with one another. 
-\layout Standard - -When an application initiates an I/O operation, it first builds a memory - descriptor that describes the memory region involved in the operation. - This memory descriptor will enable the appropriate operation (put for read - operations and get for write operations) and enable the use of remote offsets - (this lets the servers decide where their data should be placed in the - memory region). - After creating the memory descriptor and linking it into the appropriate - Portal entry, the application sends a read or write request (using -\emph on -PtlPut -\emph default -) to one of the file server processes. - The file server processes can then use put or get operations with the appropria -te offsets to fill or retrieve the contents of the application's buffer. - To know when the operation has completed, the application can add an event - queue to the memory descriptor and add up the lengths of the remote operations - until the sum is the size of the requested I/O operation. -\layout Section - -Dealing with Dropped Requests -\begin_inset LatexCommand \label{sec:exdrop} - -\end_inset - - -\layout Standard - -If a process does not anticipate unexpected requests, they will be discarded. - Applications using the Portals API can query the dropped count for the - interface to determine the number of requests that have been dropped (see - Section\SpecialChar ~ - -\begin_inset LatexCommand \ref{sec:nistatus} - -\end_inset - -). - While this approach minimizes resource consumption, it does not provide - information that might be critical in debugging the implementation of a - higher level protocol. -\layout Standard - -To keep track of more information about dropped requests, we use a memory - descriptor that truncates each incoming request to zero bytes and logs - the -\begin_inset Quotes eld -\end_inset - -dropped -\begin_inset Quotes erd -\end_inset - - operations in an event queue. 
- Note that the operations are not dropped in the Portals sense, because - the operation succeeds. -\layout Standard - -The following code fragment illustrates an implementation of this approach. - In this case, we assume that a thread is launched to execute the function - -\family typewriter -watch_drop -\family default -. - This code starts by building an event queue to log truncated operations - and a memory descriptor to truncate the incoming requests. - This example only captures -\begin_inset Quotes eld -\end_inset - -dropped -\begin_inset Quotes erd -\end_inset - - requests for a single portal. - In a more realistic situation, the memory descriptor would be appended - to the match list for every portal. - We also assume that the thread is capable of keeping up with the -\begin_inset Quotes eld -\end_inset - -dropped -\begin_inset Quotes erd -\end_inset - - requests. - If this is not the case, we could use a finite threshold on the memory - descriptor to capture the first few dropped requests. 
-\layout LyX-Code - - -\size small -#include -\newline -#include -\newline -#include -\newline - -\newline -#define DROP_SIZE 32 /* number of dropped requests to track */ -\newline - -\newline -int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) { -\newline - ptl_handle_eq_t drop_events; -\newline - ptl_event_t event; -\newline - ptl_handle_md_t drop_em; -\newline - ptl_md_t drop_desc; -\newline - ptl_process_id_t any_proc; -\newline - ptl_handle_me_t match_any; -\newline - -\newline - /* create the event queue */ -\newline - if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) { -\newline - fprintf( stderr, "Couldn't create the event queue -\backslash -n" ); -\newline - exit( 1 ); -\newline - } -\newline - -\newline - /* build a match entry */ -\newline - any_proc.nid = PTL_ID_ANY; -\newline - any_proc.pid = PTL_ID_ANY; -\newline - PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN, -\newline - &match_any ); -\newline - -\newline - /* create the memory descriptor */ -\newline - drop_desc.start = NULL; -\newline - drop_desc.length = 0; -\newline - drop_desc.threshold = PTL_MD_THRESH_INF; -\newline - drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE; -\newline - drop_desc.user_ptr = NULL; -\newline - drop_desc.eventq = drop_events; -\newline - if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) { -\newline - fprintf( stderr, "Couldn't create the memory descriptor -\backslash -n" ); -\newline - exit( 1 ); -\newline - } -\newline - -\newline - /* watch for "dropped" requests */ -\newline - while( 1 ) { -\newline - if( PtlEQWait( drop_events, &event ) != PTL_OK ) break; -\newline - fprintf( stderr, "Dropped request from gid = event.initiator.gid, - event.initiator.rid ); -\newline - } -\newline -} -\layout Section - -Message Transmission in MPI -\begin_inset LatexCommand \label{sec:exmpi} - -\end_inset - - -\layout Standard - -We conclude this section with a fairly extensive example that describes - an approach to 
implementing message transmission for MPI. - Like many MPI implementations, we distinguish two message transmission - protocols: a short message protocol and a long message protocol. - We use the constant -\family typewriter -MPI_LONG_LENGTH -\family default - to determine the size of a long message. -\layout Standard - -For small messages, the sender simply sends the message and presumes that - the message will be received (i.e., the receiver has allocated a memory region - to receive the message body). - For large messages, the sender also sends the message, but does not presume - that the message body will be saved. - Instead, the sender builds a memory descriptor for the message and enables - get operations on this descriptor. - If the target does not save the body of the message, it will record an - event for the put operation. - When the process later issues a matching MPI receive, it will perform a - get operation to retrieve the body of the message. -\layout Standard - -To facilitate receive side matching based on the protocol, we use the most - significant bit in the match bits to indicate the protocol: 1 for long - messages and 0 for short messages. -\layout Standard - -The following code presents a function that implements the send side of - the protocol. - The global variable -\family typewriter -EndGet -\family default - is the last match entry attached to the Portal index used for posting long - messages. - This entry does not match any incoming requests (i.e., the memory descriptor - rejects all get operations) and is built during initialization of the MPI - library. - The other global variable, -\family typewriter -MPI_NI -\family default -, is a handle for the network interface used by the MPI implementation. 
-\layout LyX-Code - - -\size small -extern ptl_handle_me_t EndGet; -\newline -extern ptl_handle_ni_t MPI_NI; -\newline - -\newline -void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq, -\newline - ptl_process_id_t target, ptl_match_bits_t match ) -\newline -{ -\newline - ptl_handle_md_t send_handle; -\newline - ptl_md_t mem_desc; -\newline - ptl_ack_req_t want_ack; -\newline - -\newline - mem_desc.start = buf; -\newline - mem_desc.length = len; -\newline - mem_desc.threshold = 1; -\newline - mem_desc.options = PTL_MD_OP_GET; -\newline - mem_desc.user_ptr = data; -\newline - mem_desc.eventq = eventq; -\newline - -\newline - if( len >= MPI_LONG_LENGTH ) { -\newline - ptl_handle_me_t me_handle; -\newline - -\newline - /* add a match entry to the end of the get list */ -\newline - PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet, - &me_handle ); -\newline - PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL ); -\newline - -\newline - /* we want an ack for long messages */ -\newline - want_ack = PTL_ACK_REQ; -\newline - -\newline - /* set the protocol bit to indicate that this is a long message - */ -\newline - match |= (ptl_match_bits_t)1<<63; -\newline - } else { -\newline - /* we don't want an ack for short messages */ -\newline - want_ack = PTL_NOACK_REQ; -\newline - -\newline - /* set the protocol bit to indicate that this is a short message - */ -\newline - match &= ~((ptl_match_bits_t)1<<63); -\newline - } -\newline - -\newline - /* create a memory descriptor and send it */ -\newline - PtlMDBind( MPI_NI, mem_desc, &send_handle ); -\newline - PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match, - 0 ); -\newline -} -\layout Standard - -The -\emph on -MPIsend -\emph default - function returns as soon as the message has been scheduled for transmission. - The event queue argument, -\family typewriter -eventq -\family default -, can be used to determine the disposition of the message. 
- Assuming that -\family typewriter -eventq -\family default - is not -\family typewriter -PTL_EQ_NONE -\family default -, a -\family typewriter -PTL_EVENT_SENT -\family default - event will be recorded for each message as the message is transmitted. - For small messages, this is the only event that will be recorded in -\family typewriter -eventq -\family default -. - In contrast, long messages include an explicit request for an acknowledgement. - If the -\family typewriter -target -\family default - process has posted a matching receive, the acknowledgement will be sent - as the message is received. - If a matching receive has not been posted, the message will be discarded - and no acknowledgement will be sent. - When the -\family typewriter -target -\family default - process later issues a matching receive, the receive will be translated - into a get operation and a -\family typewriter -PTL_EVENT_GET -\family default - event will be recorded in -\family typewriter -eventq -\family default -. -\layout Standard - -Figure\SpecialChar ~ - -\begin_inset LatexCommand \ref{fig:mpi} - -\end_inset - - illustrates the organization of the match list used for receiving MPI messages. - The initial entries (not shown in this figure) would be used to match the - MPI receives that have been preposted by the application. - The preposted receives are followed by a match entry, -\emph on -RcvMark -\emph default -, that marks the boundary between preposted receives and the memory descriptors - used for -\begin_inset Quotes eld -\end_inset - -unexpected -\begin_inset Quotes erd -\end_inset - - messages. - The -\emph on -RcvMark -\emph default - entry is followed by a small collection of match entries that match unexpected - -\begin_inset Quotes eld -\end_inset - -short -\begin_inset Quotes erd -\end_inset - - messages, i.e., messages that have a 0 in the most significant bit of their - match bits. 
- The memory descriptors associated with these match entries will append - the incoming message to the associated memory descriptor and record an - event in an event queue for unexpected messages. - The unexpected short message matching entries are followed by a match entry - that will match messages that were not matched by the preceding match entries, - i.e., the unexpected long messages. - The memory descriptor associated with this match entry truncates the message - body and records an event in the event queue for unexpected messages. - Note that of the memory descriptors used for unexpected messages share - a common event queue. - This makes it possible to process the unexpected messages in the order - in which they arrived, regardless of. -\layout Standard - - -\begin_inset Float figure -placement htbp -wide false -collapsed false - -\layout Standard -\align center - -\begin_inset Graphics FormatVersion 1 - filename mpi.eps - display color - size_type 0 - rotateOrigin center - lyxsize_type 1 - lyxwidth 389pt - lyxheight 284pt -\end_inset - - -\layout Caption - -Message Reception in MPI -\begin_inset LatexCommand \label{fig:mpi} - -\end_inset - - -\end_inset - - -\layout Standard - -When the local MPI process posts an MPI receive, we must first search the - events unexpected message queue to see if a matching message has already - arrived. - If no matching message is found, a match entry for the receive is inserted - before the -\emph on -RcvMark -\emph default - entry--after the match entries for all of the previously posted receives - and before the match entries for the unexpected messages. - This ensures that preposted receives are matched in the order that they - were posted (a requirement of MPI). 
- -\layout Standard - -While this strategy respects the temporal semantics of MPI, it introduces - a race condition: a matching message might arrive after the events in the - unexpected message queue have been searched, but before the match entry - for the receive has been inserted in the match list. - -\layout Standard - -To avoid this race condition we start by setting the -\family typewriter -threshold -\family default - of the memory descriptor to 0, making the descriptor inactive. - We then insert the match entry into the match list and proceed to search - the events in the unexpected message queue. - A matching message that arrives as we are searching the unexpected message - queue will not be accepted by the memory descriptor and, if not matched - by an earlier match list element, will add an event to the unexpected message - queue. - After searching the events in the unexpected message queue, we update the - memory descriptor, setting the threshold to 1 to activate the memory descriptor. - This update is predicated by the condition that the unexpected message - queue is empty. - We repeat the process of searching the unexpected message queue until the - update succeeds. -\layout Standard - -The following code fragment illustrates this approach. - Because events must be removed from the unexpected message queue to be - examined, this code fragment assumes the existence of a user managed event - list, -\family typewriter -Rcvd -\family default -, for the events that have already been removed from the unexpected message - queue. - In an effort to keep the example focused on the basic protocol, we have - omitted the code that would be needed to manage the memory descriptors - used for unexpected short messages. - In particular, we simply leave messages in these descriptors until they - are received by the application. 
- In a robust implementation, we would introduce code to ensure that short - unexpected messages are removed from these memory descriptors so that they - can be re-used. -\layout LyX-Code - - -\size small -extern ptl_handle_eq_t UnexpQueue; -\newline -extern ptl_handle_me_t RcvMark; -\newline -extern ptl_handle_me_t ShortMatch; -\newline - -\newline -typedef struct event_list_tag { -\newline - ptl_event_t event; -\newline - struct event_list_tag* next; -\newline -} event_list; -\newline - -\newline -extern event_list Rcvd; -\newline - -\newline -void AppendRcvd( ptl_event_t event ) -\newline -{ -\newline - /* append an event onto the Rcvd list */ -\newline -} -\newline - -\newline -int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi -ts_t match, -\newline - ptl_match_bits_t ignore, ptl_event_t *event ) -\newline -{ -\newline - /* Search the Rcvd event queue, looking for a message that matches the - requested message. -\newline - * If one is found, remove the event from the Rcvd list and return it. 
- */ -\newline -} -\newline - -\newline -typedef enum { RECEIVED, POSTED } receive_state; -\newline - -\newline -receive_state CopyMsg( void *buf, ptl_size_t &length, ptl_event_t event, - ptl_md_t md_buf ) -\newline -{ -\newline - ptl_md_t md_buf; -\newline - ptl_handle_me_t me_handle; -\newline - -\newline - if( event.rlength >= MPI_LONG_LENGTH ) { -\newline - PtlMDBind( MPI_NI, md_buf, &md_handle ); -\newline - PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX, - md_handle ); -\newline - return POSTED; -\newline - } else { -\newline - /* copy the message */ -\newline - if( event.mlength < *length ) *length = event.mlength; -\newline - memcpy( buf, (char*)event.md_desc.start+event.offset, *length ); -\newline - return RECEIVED; -\newline - } -\newline -} -\newline - -\newline -receive_state MPIreceive( void *buf, ptl_size_t &len, void *MPI_data, ptl_handle -_eq_t eventq, -\newline - ptl_process_id_t sender, ptl_match_bits_t match, - ptl_match_bits_t ignore ) -\newline -{ -\newline - ptl_md_t md_buf; -\newline - ptl_handle_md_t md_handle; -\newline - ptl_handle_me_t me_handle; -\newline - ptl_event_t event; -\newline - -\newline - /* build a memory descriptor for the receive */ -\newline - md_buf.start = buf; -\newline - md_buf.length = *len; -\newline - md_buf.threshold = 0; /* temporarily disabled */ -\newline - md_buf.options = PTL_MD_PUT_OP; -\newline - md_buf.user_ptr = MPI_data; -\newline - md_buf.eventq = eventq; -\newline - -\newline - /* see if we have already received the message */ -\newline - if( SearchRcvd(buf, len, sender, match, ignore, &event) ) -\newline - return CopyMsg( buf, len, event, md_buf ); -\newline - -\newline - /* create the match entry and attach the memory descriptor */ -\newline - PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark, - &me_handle); -\newline - PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle ); -\newline - -\newline - md_buf.threshold = 1; -\newline - do -\newline - if( 
PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) { -\newline - if( MPIMatch(event, match, ignore, sender) ) { -\newline - return CopyMsg( buf, len, (char*)event.md_desc.start+event.offset, - md_buf ); -\newline - } else { -\newline - AppendRcvd( event ); -\newline - } -\newline - } -\newline - while( PtlMDUpdate(md_handle, NULL, &md_buf, unexp_queue) == PTL_NOUPDATE - ); -\newline - return POSTED; -\newline -} -\layout Chapter* - -Acknowledgments -\layout Standard - -Several people have contributed to the philosophy, design, and implementation - of the Portals message passing architecture as it has evolved. - We acknowledge the following people for their contributions: Al Audette, - Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike - Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke, - Dave van Dresser, Lee Ward, and Stephen Wheat. - -\layout Standard - - -\begin_inset LatexCommand \BibTeX[ieee]{portals3} - -\end_inset - - -\the_end diff --git a/lustre/portals/doc/put.fig b/lustre/portals/doc/put.fig deleted file mode 100644 index 5235b6d..0000000 --- a/lustre/portals/doc/put.fig +++ /dev/null @@ -1,32 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -6 1350 900 2175 1200 -4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001 -4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001 --6 -2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 2700 1275 2700 1725 -2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 - 0 0 1.00 60.00 120.00 - 900 525 2700 1200 -2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 - 0 300 1200 300 1200 2250 0 2250 0 300 -2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 - 2400 300 3600 300 3600 2250 2400 2250 2400 300 -2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2 - 0 0 1.00 60.00 120.00 - 2699 1788 899 1938 -4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001 -4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001 -4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 
Acknowledgement\001 -4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001 -4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 -4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 diff --git a/lustre/portals/include/.cvsignore b/lustre/portals/include/.cvsignore deleted file mode 100644 index 94d3790..0000000 --- a/lustre/portals/include/.cvsignore +++ /dev/null @@ -1,6 +0,0 @@ -config.h -stamp-h -stamp-h1 -stamp-h.in -Makefile -Makefile.in diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am deleted file mode 100644 index 2b3eb8c..0000000 --- a/lustre/portals/include/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -SUBDIRS = linux portals - -EXTRA_DIST = cygwin-ioctl.h diff --git a/lustre/portals/include/cygwin-ioctl.h b/lustre/portals/include/cygwin-ioctl.h deleted file mode 100644 index 8a33957..0000000 --- a/lustre/portals/include/cygwin-ioctl.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * linux/ioctl.h for Linux by H.H. Bergman. - */ - -#ifndef _ASMI386_IOCTL_H -#define _ASMI386_IOCTL_H - -/* ioctl command encoding: 32 bits total, command in lower 16 bits, - * size of the parameter structure in the lower 14 bits of the - * upper 16 bits. - * Encoding the size of the parameter structure in the ioctl request - * is useful for catching programs compiled with old versions - * and to avoid overwriting user space outside the user buffer area. - * The highest 2 bits are reserved for indicating the ``access mode''. - * NOTE: This limits the max parameter size to 16kB -1 ! - */ - -/* - * The following is for compatibility across the various Linux - * platforms. The i386 ioctl numbering scheme doesn't really enforce - * a type field. De facto, however, the top 8 bits of the lower 16 - * bits are indeed used as a type field, so we might just as well make - * this explicit here. Please be sure to use the decoding macros - * below from now on. 
- */ -#undef _IO -#undef _IOR -#undef _IOW -#undef _IOC -#undef IOC_IN -#undef IOC_OUT - -#define _IOC_NRBITS 8 -#define _IOC_TYPEBITS 8 -#define _IOC_SIZEBITS 14 -#define _IOC_DIRBITS 2 - -#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) -#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) -#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) -#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) - -#define _IOC_NRSHIFT 0 -#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) -#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) -#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) - -/* - * Direction bits. - */ -#define _IOC_NONE 0U -#define _IOC_WRITE 1U -#define _IOC_READ 2U - -#define _IOC(dir,type,nr,size) \ - (((dir) << _IOC_DIRSHIFT) | \ - ((type) << _IOC_TYPESHIFT) | \ - ((nr) << _IOC_NRSHIFT) | \ - ((size) << _IOC_SIZESHIFT)) - -/* used to create numbers */ -#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) -#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) -#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) -#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) - -/* used to decode ioctl numbers.. */ -#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) -#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) -#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) - -/* ...and for the drivers/sound files... 
*/ - -#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) -#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) -#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) -#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) -#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) - -#endif /* _ASMI386_IOCTL_H */ diff --git a/lustre/portals/include/linux/.cvsignore b/lustre/portals/include/linux/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/lustre/portals/include/linux/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am deleted file mode 100644 index 3c28c6e8..0000000 --- a/lustre/portals/include/linux/Makefile.am +++ /dev/null @@ -1,4 +0,0 @@ -linuxdir = $(includedir)/linux - -EXTRA_DIST = kp30.h kpr.h libcfs.h lustre_list.h portals_compat25.h \ - portals_lib.h diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h deleted file mode 100644 index 85284ce..0000000 --- a/lustre/portals/include/linux/kp30.h +++ /dev/null @@ -1,748 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef _KP30_INCLUDED -#define _KP30_INCLUDED - -#include -#define PORTAL_DEBUG - -#ifdef __KERNEL__ -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define schedule_work schedule_task -#define prepare_work(wq,cb,cbdata) \ -do { \ - INIT_TQUEUE((wq), 0, 0); \ - PREPARE_TQUEUE((wq), (cb), (cbdata)); \ -} while (0) - -#define PageUptodate Page_Uptodate -#define our_recalc_sigpending(current) recalc_sigpending(current) -#define num_online_cpus() smp_num_cpus -static inline void our_cond_resched(void) -{ - if (current->need_resched) - schedule (); -} -#define work_struct_t struct tq_struct - -#else - -#define prepare_work(wq,cb,cbdata) \ -do { \ - INIT_WORK((wq), (void *)(cb), (void 
*)(cbdata)); \ -} while (0) -#define wait_on_page wait_on_page_locked -#define our_recalc_sigpending(current) recalc_sigpending() -#define strtok(a,b) strpbrk(a, b) -static inline void our_cond_resched(void) -{ - cond_resched(); -} -#define work_struct_t struct work_struct - -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */ - -#ifdef PORTAL_DEBUG -extern void kportal_assertion_failed(char *expr, char *file, const char *func, - const int line); -#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \ - __FUNCTION__, __LINE__)) -#define LASSERTF(cond, fmt...) \ - do { \ - if (unlikely(!(cond))) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, D_EMERG, __FILE__,\ - __FUNCTION__,__LINE__, CDEBUG_STACK,\ - "ASSERTION(" #cond ") failed:" fmt);\ - LBUG(); \ - } \ - } while (0) - -#else -#define LASSERT(e) -#define LASSERTF(cond, fmt...) do { } while (0) -#endif - -#ifdef CONFIG_SMP -#define LASSERT_SPIN_LOCKED(lock) LASSERT(spin_is_locked(lock)) -#else -#define LASSERT_SPIN_LOCKED(lock) do {} while(0) -#endif - -#ifdef __arch_um__ -#define LBUG_WITH_LOC(file, func, line) \ -do { \ - CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ - portals_debug_dumplog(); \ - portals_run_lbug_upcall(file, func, line); \ - panic("LBUG"); \ -} while (0) -#else -#define LBUG_WITH_LOC(file, func, line) \ -do { \ - CEMERG("LBUG\n"); \ - portals_debug_dumpstack(NULL); \ - portals_debug_dumplog(); \ - portals_run_lbug_upcall(file, func, line); \ - set_task_state(current, TASK_UNINTERRUPTIBLE); \ - schedule(); \ -} while (0) -#endif /* __arch_um__ */ - -#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__) - -/* - * Memory - */ -#ifdef PORTAL_DEBUG -extern atomic_t portal_kmemory; - -# define portal_kmem_inc(ptr, size) \ -do { \ - atomic_add(size, &portal_kmemory); \ -} while (0) - -# define portal_kmem_dec(ptr, size) do { \ - atomic_sub(size, &portal_kmemory); \ -} while (0) - -#else -# define portal_kmem_inc(ptr, size) do {} while (0) -# define 
portal_kmem_dec(ptr, size) do {} while (0) -#endif /* PORTAL_DEBUG */ - -#define PORTAL_VMALLOC_SIZE 16384 - -#define PORTAL_ALLOC_GFP(ptr, size, mask) \ -do { \ - LASSERT(!in_interrupt() || \ - (size <= PORTAL_VMALLOC_SIZE && mask == GFP_ATOMIC)); \ - if ((size) > PORTAL_VMALLOC_SIZE) \ - (ptr) = vmalloc(size); \ - else \ - (ptr) = kmalloc((size), (mask)); \ - if ((ptr) == NULL) { \ - CERROR("PORTALS: out of memory at %s:%d (tried to alloc '"\ - #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));\ - CERROR("PORTALS: %d total bytes allocated by portals\n", \ - atomic_read(&portal_kmemory)); \ - } else { \ - portal_kmem_inc((ptr), (size)); \ - memset((ptr), 0, (size)); \ - } \ - CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p (tot %d).\n", \ - (int)(size), (ptr), atomic_read (&portal_kmemory)); \ -} while (0) - -#define PORTAL_ALLOC(ptr, size) \ - PORTAL_ALLOC_GFP(ptr, size, GFP_NOFS) - -#define PORTAL_ALLOC_ATOMIC(ptr, size) \ - PORTAL_ALLOC_GFP(ptr, size, GFP_ATOMIC) - -#define PORTAL_FREE(ptr, size) \ -do { \ - int s = (size); \ - if ((ptr) == NULL) { \ - CERROR("PORTALS: free NULL '" #ptr "' (%d bytes) at " \ - "%s:%d\n", s, __FILE__, __LINE__); \ - break; \ - } \ - if (s > PORTAL_VMALLOC_SIZE) \ - vfree(ptr); \ - else \ - kfree(ptr); \ - portal_kmem_dec((ptr), s); \ - CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ - s, (ptr), atomic_read(&portal_kmemory)); \ -} while (0) - -/* ------------------------------------------------------------------- */ - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - -#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x) -#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x) - -#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x)) -#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) - -#define PORTAL_MODULE_USE MOD_INC_USE_COUNT -#define PORTAL_MODULE_UNUSE MOD_DEC_USE_COUNT -#else - -#define PORTAL_SYMBOL_REGISTER(x) -#define PORTAL_SYMBOL_UNREGISTER(x) - -#define 
PORTAL_SYMBOL_GET(x) symbol_get(x) -#define PORTAL_SYMBOL_PUT(x) symbol_put(x) - -#define PORTAL_MODULE_USE try_module_get(THIS_MODULE) -#define PORTAL_MODULE_UNUSE module_put(THIS_MODULE) - -#endif - -/******************************************************************************/ - -#ifdef PORTALS_PROFILING -#define prof_enum(FOO) PROF__##FOO -enum { - prof_enum(our_recvmsg), - prof_enum(our_sendmsg), - prof_enum(socknal_recv), - prof_enum(lib_parse), - prof_enum(conn_list_walk), - prof_enum(memcpy), - prof_enum(lib_finalize), - prof_enum(pingcli_time), - prof_enum(gmnal_send), - prof_enum(gmnal_recv), - MAX_PROFS -}; - -struct prof_ent { - char *str; - /* hrmph. wrap-tastic. */ - u32 starts; - u32 finishes; - cycles_t total_cycles; - cycles_t start; - cycles_t end; -}; - -extern struct prof_ent prof_ents[MAX_PROFS]; - -#define PROF_START(FOO) \ - do { \ - struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ - pe->starts++; \ - pe->start = get_cycles(); \ - } while (0) - -#define PROF_FINISH(FOO) \ - do { \ - struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ - pe->finishes++; \ - pe->end = get_cycles(); \ - pe->total_cycles += (pe->end - pe->start); \ - } while (0) -#else /* !PORTALS_PROFILING */ -#define PROF_START(FOO) do {} while(0) -#define PROF_FINISH(FOO) do {} while(0) -#endif /* PORTALS_PROFILING */ - -/* debug.c */ -void portals_debug_dumpstack(struct task_struct *tsk); -void portals_run_upcall(char **argv); -void portals_run_lbug_upcall(char * file, const char *fn, const int line); -void portals_debug_dumplog(void); -int portals_debug_init(unsigned long bufsize); -int portals_debug_cleanup(void); -int portals_debug_clear_buffer(void); -int portals_debug_mark_buffer(char *text); -int portals_debug_set_daemon(unsigned int cmd, unsigned int length, - char *file, unsigned int size); -__s32 portals_debug_copy_to_user(char *buf, unsigned long len); -#if (__GNUC__) -/* Use the special GNU C __attribute__ hack to have the compiler check the - * printf style argument 
string against the actual argument count and - * types. - */ -#ifdef printf -# warning printf has been defined as a macro... -# undef printf -#endif -void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, - char *format, ...) - __attribute__ ((format (printf, 7, 8))); -#else -void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, - const char *format, ...); -#endif /* __GNUC__ */ -void portals_debug_set_level(unsigned int debug_level); - -# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) -# define printf(format, b...) CDEBUG(D_OTHER, format , ## b) -# define time(a) CURRENT_TIME - -extern void kportal_daemonize (char *name); -extern void kportal_blockallsigs (void); - -#else /* !__KERNEL__ */ -# include -# include -#ifndef __CYGWIN__ -# include -#else -# include -#endif -# include -# include -# include -# ifndef DEBUG_SUBSYSTEM -# define DEBUG_SUBSYSTEM S_UNDEFINED -# endif -# ifdef PORTAL_DEBUG -# undef NDEBUG -# include -# define LASSERT(e) assert(e) -# define LASSERTF(cond, args...) \ -do { \ - if (!(cond)) \ - CERROR(args); \ - assert(cond); \ -} while (0) -# else -# define LASSERT(e) -# define LASSERTF(cond, args...) do { } while (0) -# endif -# define printk(format, args...) printf (format, ## args) -# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0); -# define PORTAL_FREE(a, b) do { free(a); } while (0); -void portals_debug_dumplog(void); -# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \ - printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \ - (subsys), (mask), (long)time(0), file, fn, line, \ - getpid() , stack, ## a); - -#undef CWARN -#undef CERROR -#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) -#define CERROR(format, a...) 
CDEBUG(D_ERROR, format, ## a) -#endif - -/* support decl needed both by kernel and liblustre */ -char *portals_nid2str(int nal, ptl_nid_t nid, char *str); -char *portals_id2str(int nal, ptl_process_id_t nid, char *str); - -#ifndef CURRENT_TIME -# define CURRENT_TIME time(0) -#endif - -/******************************************************************************/ -/* Light-weight trace - * Support for temporary event tracing with minimal Heisenberg effect. */ -#define LWT_SUPPORT 0 - -#define LWT_MEMORY (16<<20) - -#if !KLWT_SUPPORT -# if defined(__KERNEL__) -# if !defined(BITS_PER_LONG) -# error "BITS_PER_LONG not defined" -# endif -# elif !defined(__WORDSIZE) -# error "__WORDSIZE not defined" -# else -# define BITS_PER_LONG __WORDSIZE -# endif - -/* kernel hasn't defined this? */ -typedef struct { - long long lwte_when; - char *lwte_where; - void *lwte_task; - long lwte_p1; - long lwte_p2; - long lwte_p3; - long lwte_p4; -# if BITS_PER_LONG > 32 - long lwte_pad; -# endif -} lwt_event_t; -#endif /* !KLWT_SUPPORT */ - -#if LWT_SUPPORT -# ifdef __KERNEL__ -# if !KLWT_SUPPORT - -typedef struct _lwt_page { - struct list_head lwtp_list; - struct page *lwtp_page; - lwt_event_t *lwtp_events; -} lwt_page_t; - -typedef struct { - int lwtc_current_index; - lwt_page_t *lwtc_current_page; -} lwt_cpu_t; - -extern int lwt_enabled; -extern lwt_cpu_t lwt_cpus[]; - -/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. - * This stuff is meant for finding specific problems; it never stays in - * production code... 
*/ - -#define LWTSTR(n) #n -#define LWTWHERE(f,l) f ":" LWTSTR(l) -#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) - -#define LWT_EVENT(p1, p2, p3, p4) \ -do { \ - unsigned long flags; \ - lwt_cpu_t *cpu; \ - lwt_page_t *p; \ - lwt_event_t *e; \ - \ - if (lwt_enabled) { \ - local_irq_save (flags); \ - \ - cpu = &lwt_cpus[smp_processor_id()]; \ - p = cpu->lwtc_current_page; \ - e = &p->lwtp_events[cpu->lwtc_current_index++]; \ - \ - if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \ - cpu->lwtc_current_page = \ - list_entry (p->lwtp_list.next, \ - lwt_page_t, lwtp_list); \ - cpu->lwtc_current_index = 0; \ - } \ - \ - e->lwte_when = get_cycles(); \ - e->lwte_where = LWTWHERE(__FILE__,__LINE__); \ - e->lwte_task = current; \ - e->lwte_p1 = (long)(p1); \ - e->lwte_p2 = (long)(p2); \ - e->lwte_p3 = (long)(p3); \ - e->lwte_p4 = (long)(p4); \ - \ - local_irq_restore (flags); \ - } \ -} while (0) - -#endif /* !KLWT_SUPPORT */ - -extern int lwt_init (void); -extern void lwt_fini (void); -extern int lwt_lookup_string (int *size, char *knlptr, - char *usrptr, int usrsize); -extern int lwt_control (int enable, int clear); -extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, - void *user_ptr, int user_size); -# else /* __KERNEL__ */ -# define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ -# endif /* __KERNEL__ */ -#endif /* LWT_SUPPORT */ - -struct portals_device_userstate -{ - int pdu_memhog_pages; - struct page *pdu_memhog_root_page; -}; - -#include - -/* - * USER LEVEL STUFF BELOW - */ - -#define PORTAL_IOCTL_VERSION 0x00010007 -#define PING_SYNC 0 -#define PING_ASYNC 1 - -struct portal_ioctl_hdr { - __u32 ioc_len; - __u32 ioc_version; -}; - -struct portals_debug_ioctl_data -{ - struct portal_ioctl_hdr hdr; - unsigned int subs; - unsigned int debug; -}; - -#define PORTAL_IOC_INIT(data) \ -do { \ - memset(&data, 0, sizeof(data)); \ - data.ioc_version = PORTAL_IOCTL_VERSION; \ - data.ioc_len = sizeof(data); \ -} while (0) - -/* 
FIXME check conflict with lustre_lib.h */ -#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) - -static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) -{ - int len = sizeof(*data); - len += size_round(data->ioc_inllen1); - len += size_round(data->ioc_inllen2); - return len; -} - -static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) -{ - if (data->ioc_len > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); - return 1; - } - if (data->ioc_inllen1 > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); - return 1; - } - if (data->ioc_inllen2 > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); - return 1; - } - if (data->ioc_inlbuf1 && !data->ioc_inllen1) { - CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - if (data->ioc_inlbuf2 && !data->ioc_inllen2) { - CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - if (data->ioc_pbuf1 && !data->ioc_plen1) { - CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - if (data->ioc_pbuf2 && !data->ioc_plen2) { - CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - if (data->ioc_plen1 && !data->ioc_pbuf1) { - CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); - return 1; - } - if (data->ioc_plen2 && !data->ioc_pbuf2) { - CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); - return 1; - } - if (portal_ioctl_packlen(data) != data->ioc_len ) { - CERROR ("PORTALS ioctl: packlen != ioc_len\n"); - return 1; - } - if (data->ioc_inllen1 && - data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { - CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); - return 1; - } - if (data->ioc_inllen2 && - data->ioc_bulk[size_round(data->ioc_inllen1) + - data->ioc_inllen2 - 1] != '\0') { - CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); - return 1; - } - return 0; -} - -#ifndef __KERNEL__ -static inline int portal_ioctl_pack(struct 
portal_ioctl_data *data, char **pbuf, - int max) -{ - char *ptr; - struct portal_ioctl_data *overlay; - data->ioc_len = portal_ioctl_packlen(data); - data->ioc_version = PORTAL_IOCTL_VERSION; - - if (*pbuf && portal_ioctl_packlen(data) > max) - return 1; - if (*pbuf == NULL) { - *pbuf = malloc(data->ioc_len); - } - if (!*pbuf) - return 1; - overlay = (struct portal_ioctl_data *)*pbuf; - memcpy(*pbuf, data, sizeof(*data)); - - ptr = overlay->ioc_bulk; - if (data->ioc_inlbuf1) - LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); - if (data->ioc_inlbuf2) - LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); - if (portal_ioctl_is_invalid(overlay)) - return 1; - - return 0; -} -#else -#include - -/* buffer MUST be at least the size of portal_ioctl_hdr */ -static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) -{ - struct portal_ioctl_hdr *hdr; - struct portal_ioctl_data *data; - int err; - ENTRY; - - hdr = (struct portal_ioctl_hdr *)buf; - data = (struct portal_ioctl_data *)buf; - - err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); - if (err) - RETURN(err); - - if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { - CERROR("PORTALS: version mismatch kernel vs application\n"); - RETURN(-EINVAL); - } - - if (hdr->ioc_len + buf >= end) { - CERROR("PORTALS: user buffer exceeds kernel buffer\n"); - RETURN(-EINVAL); - } - - - if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { - CERROR("PORTALS: user buffer too small for ioctl\n"); - RETURN(-EINVAL); - } - - err = copy_from_user(buf, (void *)arg, hdr->ioc_len); - if (err) - RETURN(err); - - if (portal_ioctl_is_invalid(data)) { - CERROR("PORTALS: ioctl not correctly formatted\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1) - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - - if (data->ioc_inllen2) - data->ioc_inlbuf2 = &data->ioc_bulk[0] + - size_round(data->ioc_inllen1); - - RETURN(0); -} -#endif - -/* ioctls for manipulating snapshots 30- */ -#define IOC_PORTAL_TYPE 'e' -#define IOC_PORTAL_MIN_NR 30 - -#define 
IOC_PORTAL_PING _IOWR('e', 30, long) - -#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long) -#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long) -#define IOC_PORTAL_PANIC _IOWR('e', 34, long) -#define IOC_PORTAL_NAL_CMD _IOWR('e', 35, long) -#define IOC_PORTAL_GET_NID _IOWR('e', 36, long) -#define IOC_PORTAL_FAIL_NID _IOWR('e', 37, long) - -#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 39, long) -#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 40, long) -#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 41, long) -#define IOC_PORTAL_MEMHOG _IOWR('e', 42, long) -#define IOC_PORTAL_MAX_NR 42 - -enum { - QSWNAL = 1, - SOCKNAL = 2, - GMNAL = 3, - /* 4 unused */ - TCPNAL = 5, - ROUTER = 6, - OPENIBNAL = 7, - IIBNAL = 8, - LONAL = 9, - RANAL = 10, - NAL_ENUM_END_MARKER -}; - -#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */ - -#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) - -#define NAL_CMD_REGISTER_PEER_FD 100 -#define NAL_CMD_CLOSE_CONNECTION 101 -#define NAL_CMD_REGISTER_MYNID 102 -#define NAL_CMD_PUSH_CONNECTION 103 -#define NAL_CMD_GET_CONN 104 -#define NAL_CMD_DEL_PEER 105 -#define NAL_CMD_ADD_PEER 106 -#define NAL_CMD_GET_PEER 107 -#define NAL_CMD_GET_TXDESC 108 -#define NAL_CMD_ADD_ROUTE 109 -#define NAL_CMD_DEL_ROUTE 110 -#define NAL_CMD_GET_ROUTE 111 -#define NAL_CMD_NOTIFY_ROUTER 112 -#define NAL_CMD_ADD_INTERFACE 113 -#define NAL_CMD_DEL_INTERFACE 114 -#define NAL_CMD_GET_INTERFACE 115 - - -enum { - DEBUG_DAEMON_START = 1, - DEBUG_DAEMON_STOP = 2, - DEBUG_DAEMON_PAUSE = 3, - DEBUG_DAEMON_CONTINUE = 4, -}; - - -enum cfg_record_type { - PORTALS_CFG_TYPE = 1, - LUSTRE_CFG_TYPE = 123, -}; - -typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); - -#ifdef __CYGWIN__ -# ifndef BITS_PER_LONG -# if (~0UL) == 0xffffffffUL -# define BITS_PER_LONG 32 -# else -# define BITS_PER_LONG 64 -# endif -# endif -#endif - -#if BITS_PER_LONG > 32 -# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) -# define 
LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#else -# define LI_POISON ((int)0x5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a) -#endif - -#if defined(__x86_64__) -# define LPU64 "%Lu" -# define LPD64 "%Ld" -# define LPX64 "%#Lx" -# define LPSZ "%lu" -# define LPSSZ "%ld" -#elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) -# define LPU64 "%Lu" -# define LPD64 "%Ld" -# define LPX64 "%#Lx" -# define LPSZ "%u" -# define LPSSZ "%d" -#elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) -# define LPU64 "%lu" -# define LPD64 "%ld" -# define LPX64 "%#lx" -# define LPSZ "%lu" -# define LPSSZ "%ld" -#endif -#ifndef LPU64 -# error "No word size defined" -#endif - -/* lustre_id output helper macros */ -#define DLID4 "%lu/%lu/%lu/%lu" - -#define OLID4(id) \ - (unsigned long)(id)->li_fid.lf_id, \ - (unsigned long)(id)->li_fid.lf_group, \ - (unsigned long)(id)->li_stc.u.e3s.l3s_ino, \ - (unsigned long)(id)->li_stc.u.e3s.l3s_gen - -#endif diff --git a/lustre/portals/include/linux/kpr.h b/lustre/portals/include/linux/kpr.h deleted file mode 100644 index 1127698..0000000 --- a/lustre/portals/include/linux/kpr.h +++ /dev/null @@ -1,176 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef _KPR_H -#define _KPR_H - -# include /* for ptl_hdr_t */ - -/******************************************************************************/ -/* Kernel Portals Router interface */ - -typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback - -/* space for routing targets to stash "stuff" in a forwarded packet */ -typedef union { - long long _alignment; - void *_space[16]; /* scale with CPU arch */ -} kprfd_scratch_t; - -/* Kernel Portals Routing Forwarded message Descriptor */ -typedef struct { - struct list_head kprfd_list; /* stash in queues (routing target can use) */ - ptl_nid_t kprfd_target_nid; /* final destination NID */ - ptl_nid_t kprfd_gateway_nid; /* 
gateway NID */ - ptl_hdr_t *kprfd_hdr; /* header in wire byte order */ - int kprfd_nob; /* # payload bytes */ - int kprfd_niov; /* # payload frags */ - ptl_kiov_t *kprfd_kiov; /* payload fragments */ - void *kprfd_router_arg; /* originating NAL's router arg */ - kpr_fwd_callback_t kprfd_callback; /* completion callback */ - void *kprfd_callback_arg; /* completion callback arg */ - kprfd_scratch_t kprfd_scratch; /* scratchpad for routing targets */ -} kpr_fwd_desc_t; - -typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); -typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive); - -/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ -typedef const struct { - int kprni_nalid; /* NAL's id */ - void *kprni_arg; /* Arg to pass when calling into NAL */ - kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ - kpr_notify_t kprni_notify; /* NAL's notification entrypoint */ -} kpr_nal_interface_t; - -/* Router's routing interface (Kernel Portals Routing Router Interface) */ -typedef const struct { - /* register the calling NAL with the router and get back the handle for - * subsequent calls */ - int (*kprri_register) (kpr_nal_interface_t *nal_interface, - void **router_arg); - - /* ask the router to find a gateway that forwards to 'nid' and is a - * peer of the calling NAL; assume caller will send 'nob' bytes of - * payload there */ - int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob, - ptl_nid_t *gateway_nid); - - /* hand a packet over to the router for forwarding */ - kpr_fwd_t kprri_fwd_start; - - /* hand a packet back to the router for completion */ - void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, - int error); - - /* notify the router about peer state */ - void (*kprri_notify) (void *router_arg, ptl_nid_t peer, - int alive, time_t when); - - /* the calling NAL is shutting down */ - void (*kprri_shutdown) (void *router_arg); - - /* deregister the calling NAL with the router */ - void (*kprri_deregister) 
(void *router_arg); - -} kpr_router_interface_t; - -/* Convenient struct for NAL to stash router interface/args */ -typedef struct { - kpr_router_interface_t *kpr_interface; - void *kpr_arg; -} kpr_router_t; - -extern kpr_router_interface_t kpr_router_interface; - -static inline int -kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) -{ - int rc; - - router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); - if (router->kpr_interface == NULL) - return (-ENOENT); - - rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); - if (rc != 0) - router->kpr_interface = NULL; - - PORTAL_SYMBOL_PUT (kpr_router_interface); - return (rc); -} - -static inline int -kpr_routing (kpr_router_t *router) -{ - return (router->kpr_interface != NULL); -} - -static inline int -kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid) -{ - if (!kpr_routing (router)) - return (-ENETUNREACH); - - return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob, - gateway_nid)); -} - -static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, - int nob, int niov, ptl_kiov_t *kiov, - kpr_fwd_callback_t callback, void *callback_arg) -{ - fwd->kprfd_target_nid = nid; - fwd->kprfd_gateway_nid = nid; - fwd->kprfd_hdr = hdr; - fwd->kprfd_nob = nob; - fwd->kprfd_niov = niov; - fwd->kprfd_kiov = kiov; - fwd->kprfd_callback = callback; - fwd->kprfd_callback_arg = callback_arg; -} - -static inline void -kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) -{ - if (!kpr_routing (router)) - fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH); - else - router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); -} - -static inline void -kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) -{ - LASSERT (kpr_routing (router)); - router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); -} - -static inline void -kpr_notify (kpr_router_t *router, - ptl_nid_t peer, int alive, 
time_t when) -{ - if (!kpr_routing (router)) - return; - - router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when); -} - -static inline void -kpr_shutdown (kpr_router_t *router) -{ - if (kpr_routing (router)) - router->kpr_interface->kprri_shutdown (router->kpr_arg); -} - -static inline void -kpr_deregister (kpr_router_t *router) -{ - if (!kpr_routing (router)) - return; - router->kpr_interface->kprri_deregister (router->kpr_arg); - router->kpr_interface = NULL; -} - -#endif /* _KPR_H */ diff --git a/lustre/portals/include/linux/libcfs.h b/lustre/portals/include/linux/libcfs.h deleted file mode 100644 index d8f5349..0000000 --- a/lustre/portals/include/linux/libcfs.h +++ /dev/null @@ -1,410 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef _LIBCFS_H -#define _LIBCFS_H - -#ifdef HAVE_ASM_TYPES_H -#include -#else -#include "types.h" -#endif - -#ifdef __KERNEL__ -# include -# include -#else -# include -# define do_gettimeofday(tv) gettimeofday(tv, NULL); -typedef unsigned long long cycles_t; -#endif - -#define PORTAL_DEBUG - -#ifndef offsetof -# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) - -#ifndef __KERNEL__ -/* Userpace byte flipping */ -# include -# include -# define __swab16(x) bswap_16(x) -# define __swab32(x) bswap_32(x) -# define __swab64(x) bswap_64(x) -# define __swab16s(x) do {*(x) = bswap_16(*(x));} while (0) -# define __swab32s(x) do {*(x) = bswap_32(*(x));} while (0) -# define __swab64s(x) do {*(x) = bswap_64(*(x));} while (0) -# if __BYTE_ORDER == __LITTLE_ENDIAN -# define le16_to_cpu(x) (x) -# define cpu_to_le16(x) (x) -# define le32_to_cpu(x) (x) -# define cpu_to_le32(x) (x) -# define le64_to_cpu(x) (x) -# define cpu_to_le64(x) (x) -# else -# if __BYTE_ORDER == __BIG_ENDIAN -# define le16_to_cpu(x) bswap_16(x) -# define cpu_to_le16(x) bswap_16(x) -# define le32_to_cpu(x) 
bswap_32(x) -# define cpu_to_le32(x) bswap_32(x) -# define le64_to_cpu(x) bswap_64(x) -# define cpu_to_le64(x) bswap_64(x) -# else -# error "Unknown byte order" -# endif /* __BIG_ENDIAN */ -# endif /* __LITTLE_ENDIAN */ -#endif /* ! __KERNEL__ */ - -/* - * Debugging - */ -extern unsigned int portal_subsystem_debug; -extern unsigned int portal_stack; -extern unsigned int portal_debug; -extern unsigned int portal_printk; - -struct ptldebug_header { - __u32 ph_len; - __u32 ph_flags; - __u32 ph_subsys; - __u32 ph_mask; - __u32 ph_cpu_id; - __u32 ph_sec; - __u64 ph_usec; - __u32 ph_stack; - __u32 ph_pid; - __u32 ph_extern_pid; - __u32 ph_line_num; -} __attribute__((packed)); - -#define PH_FLAG_FIRST_RECORD 1 - -/* Debugging subsystems (32 bits, non-overlapping) */ -#define S_UNDEFINED 0x00000001 -#define S_MDC 0x00000002 -#define S_MDS 0x00000004 -#define S_OSC 0x00000008 -#define S_OST 0x00000010 -#define S_CLASS 0x00000020 -#define S_LOG 0x00000040 -#define S_LLITE 0x00000080 -#define S_RPC 0x00000100 -#define S_MGMT 0x00000200 -#define S_PORTALS 0x00000400 -#define S_NAL 0x00000800 /* ALL NALs */ -#define S_PINGER 0x00001000 -#define S_FILTER 0x00002000 -#define S_PTLBD 0x00004000 -#define S_ECHO 0x00008000 -#define S_LDLM 0x00010000 -#define S_LOV 0x00020000 -#define S_PTLROUTER 0x00040000 -#define S_COBD 0x00080000 -#define S_SM 0x00100000 -#define S_ASOBD 0x00200000 -#define S_CONFOBD 0x00400000 -#define S_LMV 0x00800000 -#define S_CMOBD 0x01000000 -/* If you change these values, please keep these files up to date... 
- * portals/utils/debug.c - * utils/lconf - */ - -/* Debugging masks (32 bits, non-overlapping) */ -#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ -#define D_INODE 0x00000002 -#define D_SUPER 0x00000004 -#define D_EXT2 0x00000008 /* anything from ext2_debug */ -#define D_MALLOC 0x00000010 /* print malloc, free information */ -#define D_CACHE 0x00000020 /* cache-related items */ -#define D_INFO 0x00000040 /* general information */ -#define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_BLOCKS 0x00000100 /* ext2 block allocation */ -#define D_NET 0x00000200 /* network communications */ -#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ -#define D_BUFFS 0x00000800 -#define D_OTHER 0x00001000 -#define D_DENTRY 0x00002000 -#define D_PORTALS 0x00004000 /* ENTRY/EXIT markers */ -#define D_PAGE 0x00008000 /* bulk page handling */ -#define D_DLMTRACE 0x00010000 -#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ -#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ -#define D_HA 0x00080000 /* recovery and failover */ -#define D_RPCTRACE 0x00100000 /* for distributed debugging */ -#define D_VFSTRACE 0x00200000 -#define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 -#define D_CONFIG 0x01000000 -/* If you change these values, please keep these files up to date... 
- * portals/utils/debug.c - * utils/lconf - */ - -#ifdef __KERNEL__ -# include /* THREAD_SIZE */ -#else -# ifndef THREAD_SIZE /* x86_64 has THREAD_SIZE in userspace */ -# define THREAD_SIZE 8192 -# endif -#endif - -#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) - -#ifdef __KERNEL__ -# ifdef __ia64__ -# define CDEBUG_STACK (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) -# else -# define CDEBUG_STACK (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) -# endif /* __ia64__ */ - -#define CHECK_STACK(stack) \ - do { \ - if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, D_WARNING, \ - __FILE__, __FUNCTION__, __LINE__, \ - (stack),"maximum lustre stack %u\n",\ - portal_stack = (stack)); \ - /*panic("LBUG");*/ \ - } \ - } while (0) -#else /* !__KERNEL__ */ -#define CHECK_STACK(stack) do { } while(0) -#define CDEBUG_STACK (0L) -#endif /* __KERNEL__ */ - -#if 1 -#define CDEBUG(mask, format, a...) \ -do { \ - CHECK_STACK(CDEBUG_STACK); \ - if (((mask) & (D_ERROR | D_EMERG | D_WARNING)) || \ - (portal_debug & (mask) && \ - portal_subsystem_debug & DEBUG_SUBSYSTEM)) \ - portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ - __FILE__, __FUNCTION__, __LINE__, \ - CDEBUG_STACK, format, ## a); \ -} while (0) - -#define CDEBUG_MAX_LIMIT 600 -#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) 
\ -do { \ - static unsigned long cdebug_next = 0; \ - static int cdebug_count = 0, cdebug_delay = 1; \ - \ - CHECK_STACK(CDEBUG_STACK); \ - if (time_after(jiffies, cdebug_next)) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \ - __FUNCTION__, __LINE__, CDEBUG_STACK, \ - cdebug_format, ## a); \ - if (cdebug_count) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \ - __FILE__, __FUNCTION__, __LINE__, \ - 0, "skipped %d similar messages\n", \ - cdebug_count); \ - cdebug_count = 0; \ - } \ - if (time_after(jiffies, cdebug_next+(CDEBUG_MAX_LIMIT+10)*HZ))\ - cdebug_delay = cdebug_delay > 8 ? cdebug_delay/8 : 1; \ - else \ - cdebug_delay = cdebug_delay*2 >= CDEBUG_MAX_LIMIT*HZ ?\ - CDEBUG_MAX_LIMIT*HZ : cdebug_delay*2; \ - cdebug_next = jiffies + cdebug_delay; \ - } else { \ - portals_debug_msg(DEBUG_SUBSYSTEM, \ - portal_debug & ~(D_EMERG|D_ERROR|D_WARNING),\ - __FILE__, __FUNCTION__, __LINE__, \ - CDEBUG_STACK, cdebug_format, ## a); \ - cdebug_count++; \ - } \ -} while (0) - -#define CWARN(format, a...) CDEBUG_LIMIT(D_WARNING, format, ## a) -#define CERROR(format, a...) CDEBUG_LIMIT(D_ERROR, format, ## a) -#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) - -#define GOTO(label, rc) \ -do { \ - long GOTO__ret = (long)(rc); \ - CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \ - #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\ - (signed long)GOTO__ret); \ - goto label; \ -} while (0) - -#define RETURN(rc) \ -do { \ - typeof(rc) RETURN__ret = (rc); \ - CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ - (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\ - return RETURN__ret; \ -} while (0) - -#define ENTRY \ -do { \ - CDEBUG(D_TRACE, "Process entered\n"); \ -} while (0) - -#define EXIT \ -do { \ - CDEBUG(D_TRACE, "Process leaving\n"); \ -} while(0) -#else -#define CDEBUG(mask, format, a...) do { } while (0) -#define CWARN(format, a...) printk(KERN_WARNING format, ## a) -#define CERROR(format, a...) 
printk(KERN_ERR format, ## a) -#define CEMERG(format, a...) printk(KERN_EMERG format, ## a) -#define GOTO(label, rc) do { (void)(rc); goto label; } while (0) -#define RETURN(rc) return (rc) -#define ENTRY do { } while (0) -#define EXIT do { } while (0) -#endif - -/* initial pid */ -# if CRAY_PORTALS -/* - * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this - * is too big. - * - * 2) the implementation of ernal in cray portals further restricts the pid - * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns - * an error at nal init time for any pid outside this range. Other nals - * in cray portals don't have this restriction. - * */ -#define LUSTRE_PTL_PID 9 -# else -#define LUSTRE_PTL_PID 12345 -# endif - -#define LUSTRE_SRV_PTL_PID LUSTRE_PTL_PID - -#define PORTALS_CFG_VERSION 0x00010001; - -struct portals_cfg { - __u32 pcfg_version; - __u32 pcfg_command; - - __u32 pcfg_nal; - __u32 pcfg_flags; - - __u32 pcfg_gw_nal; - __u64 pcfg_nid; - __u64 pcfg_nid2; - __u64 pcfg_nid3; - __u32 pcfg_id; - __u32 pcfg_misc; - __u32 pcfg_fd; - __u32 pcfg_count; - __u32 pcfg_size; - __u32 pcfg_wait; - - __u32 pcfg_plen1; /* buffers in userspace */ - char *pcfg_pbuf1; - __u32 pcfg_plen2; /* buffers in userspace */ - char *pcfg_pbuf2; -}; - -#define PCFG_INIT(pcfg, cmd) \ -do { \ - memset(&pcfg, 0, sizeof(pcfg)); \ - pcfg.pcfg_version = PORTALS_CFG_VERSION; \ - pcfg.pcfg_command = (cmd); \ - \ -} while (0) - -typedef int (nal_cmd_handler_fn)(struct portals_cfg *, void *); -int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *arg); -int libcfs_nal_cmd(struct portals_cfg *pcfg); -void libcfs_nal_cmd_unregister(int nal); - -struct portal_ioctl_data { - __u32 ioc_len; - __u32 ioc_version; - __u64 ioc_nid; - __u64 ioc_nid2; - __u64 ioc_nid3; - __u32 ioc_count; - __u32 ioc_nal; - __u32 ioc_nal_cmd; - __u32 ioc_fd; - __u32 ioc_id; - - __u32 ioc_flags; - __u32 ioc_size; - - __u32 ioc_wait; - __u32 ioc_timeout; - __u32 ioc_misc; - - 
__u32 ioc_inllen1; - char *ioc_inlbuf1; - __u32 ioc_inllen2; - char *ioc_inlbuf2; - - __u32 ioc_plen1; /* buffers in userspace */ - char *ioc_pbuf1; - __u32 ioc_plen2; /* buffers in userspace */ - char *ioc_pbuf2; - - char ioc_bulk[0]; -}; - - -#ifdef __KERNEL__ - -#include - -struct libcfs_ioctl_handler { - struct list_head item; - int (*handle_ioctl)(struct portal_ioctl_data *data, - unsigned int cmd, unsigned long args); -}; - -#define DECLARE_IOCTL_HANDLER(ident, func) \ - struct libcfs_ioctl_handler ident = { \ - .item = LIST_HEAD_INIT(ident.item), \ - .handle_ioctl = func \ - } - -int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); -int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); - -#endif - -#ifdef __KERNEL__ -/* libcfs watchdogs */ -struct lc_watchdog; - -/* Just use the default handler (dumplog) */ -#define LC_WATCHDOG_DEFAULT_CB NULL - -/* Add a watchdog which fires after "time" milliseconds of delay. You have to - * touch it once to enable it. */ -struct lc_watchdog *lc_watchdog_add(int time, - void (*cb)(struct lc_watchdog *, - struct task_struct *, - void *), - void *data); - -/* Enables a watchdog and resets its timer. */ -void lc_watchdog_touch(struct lc_watchdog *lcw); - -/* Disable a watchdog; touch it to restart it. */ -void lc_watchdog_disable(struct lc_watchdog *lcw); - -/* Clean up the watchdog */ -void lc_watchdog_delete(struct lc_watchdog *lcw); - -/* Dump a debug log */ -void lc_watchdog_dumplog(struct lc_watchdog *lcw, - struct task_struct *tsk, - void *data); -#endif /* !__KERNEL__ */ - -#endif /* _LIBCFS_H */ diff --git a/lustre/portals/include/linux/lustre_list.h b/lustre/portals/include/linux/lustre_list.h deleted file mode 100644 index a218f2c..0000000 --- a/lustre/portals/include/linux/lustre_list.h +++ /dev/null @@ -1,246 +0,0 @@ -#ifndef _LUSTRE_LIST_H -#define _LUSTRE_LIST_H - -#ifdef __KERNEL__ -#include -#else -/* - * Simple doubly linked list implementation. 
- * - * Some of the internal functions ("__xxx") are useful when - * manipulating whole lists rather than single entries, as - * sometimes we already know the next/prev entries and we can - * generate better code by using them directly rather than - * using the generic single-entry routines. - */ - -#define prefetch(a) ((void)a) - -struct list_head { - struct list_head *next, *prev; -}; - -typedef struct list_head list_t; - -#define LIST_HEAD_INIT(name) { &(name), &(name) } - -#define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) - -#define INIT_LIST_HEAD(ptr) do { \ - (ptr)->next = (ptr); (ptr)->prev = (ptr); \ -} while (0) - -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static inline void __list_add(struct list_head * new, - struct list_head * prev, - struct list_head * next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - -/** - * list_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. - */ -static inline void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} - -/** - * list_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static inline void list_add_tail(struct list_head *new, struct list_head *head) -{ - __list_add(new, head->prev, head); -} - -/* - * Delete a list entry by making the prev/next entries - * point to each other. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! 
- */ -static inline void __list_del(struct list_head * prev, struct list_head * next) -{ - next->prev = prev; - prev->next = next; -} - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is in an undefined state. - */ -static inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); -} - -/** - * list_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. - */ -static inline void list_del_init(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} - -/** - * list_move - delete from one list and add as another's head - * @list: the entry to move - * @head: the head that will precede our entry - */ -static inline void list_move(struct list_head *list, struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add(list, head); -} - -/** - * list_move_tail - delete from one list and add as another's tail - * @list: the entry to move - * @head: the head that will follow our entry - */ -static inline void list_move_tail(struct list_head *list, - struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add_tail(list, head); -} - -/** - * list_empty - tests whether a list is empty - * @head: the list to test. - */ -static inline int list_empty(struct list_head *head) -{ - return head->next == head; -} - -static inline void __list_splice(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; -} - -/** - * list_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. 
- */ -static inline void list_splice(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head); -} - -/** - * list_splice_init - join two lists and reinitialise the emptied list. - * @list: the new list to add. - * @head: the place to add it in the first list. - * - * The list at @list is reinitialised - */ -static inline void list_splice_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head); - INIT_LIST_HEAD(list); - } -} - -/** - * list_entry - get the struct for this entry - * @ptr: the &struct list_head pointer. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_struct within the struct. - */ -#define list_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -/** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. - */ -#define list_for_each(pos, head) \ - for (pos = (head)->next, prefetch(pos->next); pos != (head); \ - pos = pos->next, prefetch(pos->next)) - -/** - * list_for_each_prev - iterate over a list in reverse order - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. - */ -#define list_for_each_prev(pos, head) \ - for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ - pos = pos->prev, prefetch(pos->prev)) - -/** - * list_for_each_safe - iterate over a list safe against removal of list entry - * @pos: the &struct list_head to use as a loop counter. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. - */ -#define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) - -/** - * list_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop counter. - * @head: the head for your list. 
- * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - prefetch(pos->member.next); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member), \ - prefetch(pos->member.next)) - -/** - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop counter. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -#endif /* if !__KERNEL__*/ -#endif /* if !_LUSTRE_LIST_H */ diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h deleted file mode 100644 index fa2709e..0000000 --- a/lustre/portals/include/linux/portals_compat25.h +++ /dev/null @@ -1,96 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef _PORTALS_COMPAT_H -#define _PORTALS_COMPAT_H - -// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved -#if SPINLOCK_DEBUG -# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20) -# define SIGNAL_MASK_ASSERT() \ - LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC) -# else -# define SIGNAL_MASK_ASSERT() \ - LASSERT(current->sigmask_lock.magic == SPINLOCK_MAGIC) -# endif -#else -# define SIGNAL_MASK_ASSERT() -#endif -// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - -# define SIGNAL_MASK_LOCK(task, flags) \ - 
spin_lock_irqsave(&task->sighand->siglock, flags) -# define SIGNAL_MASK_UNLOCK(task, flags) \ - spin_unlock_irqrestore(&task->sighand->siglock, flags) -# define USERMODEHELPER(path, argv, envp) \ - call_usermodehelper(path, argv, envp, 1) -# define RECALC_SIGPENDING recalc_sigpending() -# define CLEAR_SIGPENDING clear_tsk_thread_flag(current, \ - TIF_SIGPENDING) -# define CURRENT_SECONDS get_seconds() -# define smp_num_cpus num_online_cpus() - - -#elif defined(CONFIG_RH_2_4_20) /* RH 2.4.x */ - -# define SIGNAL_MASK_LOCK(task, flags) \ - spin_lock_irqsave(&task->sighand->siglock, flags) -# define SIGNAL_MASK_UNLOCK(task, flags) \ - spin_unlock_irqrestore(&task->sighand->siglock, flags) -# define USERMODEHELPER(path, argv, envp) \ - call_usermodehelper(path, argv, envp) -# define RECALC_SIGPENDING recalc_sigpending() -# define CLEAR_SIGPENDING (current->sigpending = 0) -# define CURRENT_SECONDS CURRENT_TIME - -#else /* 2.4.x */ - -# define SIGNAL_MASK_LOCK(task, flags) \ - spin_lock_irqsave(&task->sigmask_lock, flags) -# define SIGNAL_MASK_UNLOCK(task, flags) \ - spin_unlock_irqrestore(&task->sigmask_lock, flags) -# define USERMODEHELPER(path, argv, envp) \ - call_usermodehelper(path, argv, envp) -# define RECALC_SIGPENDING recalc_sigpending(current) -# define CLEAR_SIGPENDING (current->sigpending = 0) -# define CURRENT_SECONDS CURRENT_TIME - -#endif - -#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) -#define UML_PID(tsk) ((tsk)->thread.extern_pid) -#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define UML_PID(tsk) ((tsk)->thread.mode.tt.extern_pid) -#else -#define UML_PID(tsk) ((tsk)->pid) -#endif - -#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# define THREAD_NAME(comm, len, fmt, a...) \ - snprintf(comm, len,fmt"|%d", ## a, UML_PID(current)) -#else -# define THREAD_NAME(comm, len, fmt, a...) 
\ - snprintf(comm, len, fmt, ## a) -#endif - -#ifdef HAVE_PAGE_LIST -/* 2.4 alloc_page users can use page->list */ -#define PAGE_LIST_ENTRY list -#define PAGE_LIST(page) ((page)->list) -#else -/* 2.6 alloc_page users can use page->lru */ -#define PAGE_LIST_ENTRY lru -#define PAGE_LIST(page) ((page)->lru) -#endif - -#ifndef HAVE_CPU_ONLINE -#define cpu_online(cpu) (test_bit(cpu_online_map, &(cpu))) -#endif -#ifndef HAVE_CPUMASK_T -#define cpu_set(cpu, map) (set_bit(cpu, &(map))) -typedef unsigned long cpumask_t; -#endif - -#endif /* _PORTALS_COMPAT_H */ diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h deleted file mode 100644 index 8778a52..0000000 --- a/lustre/portals/include/linux/portals_lib.h +++ /dev/null @@ -1,90 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Basic library routines. - * - */ - -#ifndef _PORTALS_LIB_H -#define _PORTALS_LIB_H - -#ifndef __KERNEL__ -# include -#else -# include -#endif - -#undef MIN -#define MIN(a,b) (((a)<(b)) ? (a): (b)) -#undef MAX -#define MAX(a,b) (((a)>(b)) ? (a): (b)) -#define MKSTR(ptr) ((ptr))? 
(ptr) : "" - -static inline int size_round (int val) -{ - return (val + 7) & (~0x7); -} - -static inline int size_round16(int val) -{ - return (val + 0xf) & (~0xf); -} - -static inline int size_round32(int val) -{ - return (val + 0x1f) & (~0x1f); -} - -static inline int size_round0(int val) -{ - if (!val) - return 0; - return (val + 1 + 7) & (~0x7); -} - -static inline size_t round_strlen(char *fset) -{ - return size_round(strlen(fset) + 1); -} - -#define LOGL(var,len,ptr) \ -do { \ - if (var) \ - memcpy((char *)ptr, (const char *)var, len); \ - ptr += size_round(len); \ -} while (0) - -#define LOGU(var,len,ptr) \ -do { \ - if (var) \ - memcpy((char *)var, (const char *)ptr, len); \ - ptr += size_round(len); \ -} while (0) - -#define LOGL0(var,len,ptr) \ -do { \ - if (!len) \ - break; \ - memcpy((char *)ptr, (const char *)var, len); \ - *((char *)(ptr) + len) = 0; \ - ptr += size_round(len + 1); \ -} while (0) - -#endif /* _PORTALS_LIB_H */ diff --git a/lustre/portals/include/portals/.cvsignore b/lustre/portals/include/portals/.cvsignore deleted file mode 100644 index 282522d..0000000 --- a/lustre/portals/include/portals/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -Makefile -Makefile.in diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am deleted file mode 100644 index 4043f66..0000000 --- a/lustre/portals/include/portals/Makefile.am +++ /dev/null @@ -1,10 +0,0 @@ -portalsdir=$(includedir)/portals - -if UTILS -portals_HEADERS = list.h -endif - -EXTRA_DIST = api.h api-support.h build_check.h errno.h \ - internal.h lib-p30.h lib-types.h list.h \ - lltrace.h myrnal.h nal.h nalids.h p30.h ptlctl.h \ - socknal.h stringtab.h types.h diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h deleted file mode 100644 index c5994c6..0000000 --- a/lustre/portals/include/portals/api-support.h +++ /dev/null @@ -1,22 +0,0 @@ - -#include "build_check.h" - -#ifndef __KERNEL__ -# include -# 
include -# include -# include - -/* Lots of POSIX dependencies to support PtlEQWait_timeout */ -# include -# include -# include -#endif - -#include -#include -#include - -#include -#include - diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h deleted file mode 100644 index 56b7b99..0000000 --- a/lustre/portals/include/portals/api.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef P30_API_H -#define P30_API_H - -#include "build_check.h" - -#include - -int PtlInit(int *); -void PtlFini(void); - -int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, - ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, - ptl_handle_ni_t *interface_out); - -int PtlNIInitialized(ptl_interface_t); - -int PtlNIFini(ptl_handle_ni_t interface_in); - -int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); - -int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid); - - -/* - * Network interfaces - */ - -int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t * status_out); - -int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, - unsigned long *distance_out); - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); - - -/* - * PtlNIFailNid - * - * Not an official Portals 3 API call. It provides a way of simulating - * communications failures to all (nid == PTL_NID_ANY), or specific peers - * (via multiple calls), either until further notice (threshold == -1), or - * for a specific number of messages. Passing a threshold of zero, "heals" - * the given peer. - */ -int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); - -/* - * PtlSnprintHandle: - * - * This is not an official Portals 3 API call. It is provided - * so that an application can print an opaque handle. 
- */ -void PtlSnprintHandle (char *str, int str_len, ptl_handle_any_t handle); - -/* - * Match entries - */ - -int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, - ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, - ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); - -int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, - ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, - ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, - ptl_handle_me_t * handle_out); - -int PtlMEUnlink(ptl_handle_me_t current_in); - -int PtlMEUnlinkList(ptl_handle_me_t current_in); - - - -/* - * Memory descriptors - */ - -int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); - -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); - -int PtlMDUnlink(ptl_handle_md_t md_in); - -int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, - ptl_md_t * new_inout, ptl_handle_eq_t testq_in); - - -/* These should not be called by users */ -int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, - ptl_md_t * new_inout, ptl_handle_eq_t testq_in, - ptl_seq_t sequence_in); - - - - -/* - * Event queues - */ -int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, - ptl_eq_handler_t handler, - ptl_handle_eq_t *handle_out); -int PtlEQFree(ptl_handle_eq_t eventq_in); - -int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); - - -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); - -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out); - -/* - * Access Control Table - */ -int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, - ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); - - -/* - * Data movement - */ - -int PtlPut(ptl_handle_md_t md_in, 
ptl_ack_req_t ack_req_in, - ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, - ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); - -int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, - ptl_match_bits_t match_bits_in, ptl_size_t offset_in); - - - -#endif diff --git a/lustre/portals/include/portals/build_check.h b/lustre/portals/include/portals/build_check.h deleted file mode 100644 index c219d2a..0000000 --- a/lustre/portals/include/portals/build_check.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _BUILD_CHECK_H -#define _BUILD_CHECK_H - -#if CRAY_PORTALS -#error "an application got to me instead of cray's includes" -#endif - -#endif diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h deleted file mode 100644 index 42f2626..0000000 --- a/lustre/portals/include/portals/errno.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _P30_ERRNO_H_ -#define _P30_ERRNO_H_ - -#include "build_check.h" -/* - * include/portals/errno.h - * - * Shared error number lists - */ - -/* If you change these, you must update the string table in api-errno.c */ -typedef enum { - PTL_OK = 0, - PTL_SEGV = 1, - - PTL_NO_SPACE = 2, - PTL_ME_IN_USE = 3, - PTL_VAL_FAILED = 4, - - PTL_NAL_FAILED = 5, - PTL_NO_INIT = 6, - PTL_IFACE_DUP = 7, - PTL_IFACE_INVALID = 8, - - PTL_HANDLE_INVALID = 9, - PTL_MD_INVALID = 10, - PTL_ME_INVALID = 11, -/* If you change these, you must update the string table in api-errno.c */ - PTL_PROCESS_INVALID = 12, - PTL_PT_INDEX_INVALID = 13, - - PTL_SR_INDEX_INVALID = 14, - PTL_EQ_INVALID = 15, - PTL_EQ_DROPPED = 16, - - PTL_EQ_EMPTY = 17, - PTL_MD_NO_UPDATE = 18, - PTL_FAIL = 19, - - PTL_IOV_INVALID = 20, - - PTL_EQ_IN_USE = 21, - - PTL_NI_INVALID = 22, - PTL_MD_ILLEGAL = 23, - - PTL_MAX_ERRNO = 24 -} ptl_err_t; -/* If you change these, you must update the string table in api-errno.c */ - -extern const char 
*ptl_err_str[]; - -#endif diff --git a/lustre/portals/include/portals/internal.h b/lustre/portals/include/portals/internal.h deleted file mode 100644 index e69de29..0000000 diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h deleted file mode 100644 index b710569..0000000 --- a/lustre/portals/include/portals/lib-p30.h +++ /dev/null @@ -1,465 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib-p30.h - * - * Top level include for library side routines - */ - -#ifndef _LIB_P30_H_ -#define _LIB_P30_H_ - -#include "build_check.h" - -#ifdef __KERNEL__ -# include -# include -#else -# include -# include -# include -#endif -#include -#include -#include -#include -#include - -static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) -{ - return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && - wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); -} - -#ifdef __KERNEL__ -#define LIB_LOCK(nal,flags) \ - spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) -#define LIB_UNLOCK(nal,flags) \ - spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) -#else -#define LIB_LOCK(nal,flags) \ - (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) -#define LIB_UNLOCK(nal,flags) \ - pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) -#endif - - -#ifdef PTL_USE_LIB_FREELIST - -#define MAX_MES 2048 -#define MAX_MDS 2048 -#define MAX_MSGS 2048 /* Outstanding messages */ -#define MAX_EQS 512 - -extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); - -static inline void * -lib_freelist_alloc (lib_freelist_t *fl) -{ - /* ALWAYS called with liblock held */ - lib_freeobj_t *o; - - if (list_empty (&fl->fl_list)) - return (NULL); - - o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); - list_del (&o->fo_list); - return 
((void *)&o->fo_contents); -} - -static inline void -lib_freelist_free (lib_freelist_t *fl, void *obj) -{ - /* ALWAYS called with liblock held */ - lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); - - list_add (&o->fo_list, &fl->fl_list); -} - - -static inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_eq_t *eq; - - LIB_LOCK (nal, flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); - LIB_UNLOCK (nal, flags); - - return (eq); -} - -static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); -} - -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_md_t *md; - - LIB_LOCK (nal, flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); - LIB_UNLOCK (nal, flags); - - return (md); -} - -static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); -} - -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_me_t *me; - - LIB_LOCK (nal, flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); - LIB_UNLOCK (nal, flags); - - return (me); -} - -static inline void -lib_me_free (lib_nal_t *nal, lib_me_t *me) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); -} - -static inline lib_msg_t * -lib_msg_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_msg_t *msg; - - LIB_LOCK (nal, flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); - LIB_UNLOCK (nal, flags); - - if (msg != NULL) { - /* NULL pointers, clear flags etc */ - memset (msg, 0, sizeof (*msg)); - 
msg->ack_wmd = PTL_WIRE_HANDLE_NONE; - } - return(msg); -} - -static inline void -lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); -} - -#else - -static inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - lib_eq_t *eq; - - PORTAL_ALLOC(eq, sizeof(*eq)); - return (eq); -} - -static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(eq, sizeof(*eq)); -} - -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) -{ - /* NEVER called with liblock held */ - lib_md_t *md; - int size; - int niov; - - if ((umd->options & PTL_MD_KIOV) != 0) { - niov = umd->length; - size = offsetof(lib_md_t, md_iov.kiov[niov]); - } else { - niov = ((umd->options & PTL_MD_IOVEC) != 0) ? - umd->length : 1; - size = offsetof(lib_md_t, md_iov.iov[niov]); - } - - PORTAL_ALLOC(md, size); - - if (md != NULL) { - /* Set here in case of early free */ - md->options = umd->options; - md->md_niov = niov; - } - - return (md); -} - -static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) -{ - /* ALWAYS called with liblock held */ - int size; - - if ((md->options & PTL_MD_KIOV) != 0) - size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); - else - size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); - - PORTAL_FREE(md, size); -} - -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - lib_me_t *me; - - PORTAL_ALLOC(me, sizeof(*me)); - return (me); -} - -static inline void -lib_me_free(lib_nal_t *nal, lib_me_t *me) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(me, sizeof(*me)); -} - -static inline lib_msg_t * -lib_msg_alloc(lib_nal_t *nal) -{ - /* NEVER called with liblock held; may be in interrupt... 
*/ - lib_msg_t *msg; - - if (in_interrupt()) - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - else - PORTAL_ALLOC(msg, sizeof(*msg)); - - if (msg != NULL) { - /* NULL pointers, clear flags etc */ - memset (msg, 0, sizeof (*msg)); - msg->ack_wmd = PTL_WIRE_HANDLE_NONE; - } - return (msg); -} - -static inline void -lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(msg, sizeof(*msg)); -} -#endif - -extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); - -static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) -{ - if (eq == NULL) { - *handle = PTL_EQ_NONE; - return; - } - - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = eq->eq_lh.lh_cookie; -} - -static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_EQ); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_eq_t, eq_lh)); -} - -static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) -{ - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = md->md_lh.lh_cookie; -} - -static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_MD); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_md_t, md_lh)); -} - -static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh; - - if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) - return (NULL); - - lh = lib_lookup_cookie 
(nal, wh->wh_object_cookie, - PTL_COOKIE_TYPE_MD); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_md_t, md_lh)); -} - -static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) -{ - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = me->me_lh.lh_cookie; -} - -static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_ME); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_me_t, me_lh)); -} - -extern int lib_init(lib_nal_t *libnal, nal_t *apinal, - ptl_process_id_t pid, - ptl_ni_limits_t *desired_limits, - ptl_ni_limits_t *actual_limits); -extern int lib_fini(lib_nal_t *libnal); - -/* - * When the NAL detects an incoming message header, it should call - * lib_parse() decode it. If the message header is garbage, lib_parse() - * returns immediately with failure, otherwise the NAL callbacks will be - * called to receive the message body. They are handed the private cookie - * as a way for the NAL to maintain state about which transaction is being - * processed. An extra parameter, lib_msg contains the lib-level message - * state for passing to lib_finalize() when the message body has been - * received. 
- */ -extern void lib_enq_event_locked (lib_nal_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_ni_fail_t ni_fail_type); -extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, - lib_msg_t *get_msg); -extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); - - -extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - ptl_size_t offset, ptl_size_t len); - -extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len); - -extern void lib_assert_wire_constants (void); - -extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); - -extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, - ptl_sr_value_t *status); -extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, - unsigned long *dist); - -extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle); -extern int lib_api_eq_free(nal_t 
*nal, ptl_handle_eq_t *eqh); -extern int lib_api_eq_poll (nal_t *nal, - ptl_handle_eq_t *eventqs, int neq, int timeout_ms, - ptl_event_t *event, int *which); - -extern int lib_api_me_attach(nal_t *nal, - ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_insert(nal_t *nal, - ptl_handle_me_t *current_meh, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); -extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); - -extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); - -extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); -extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); -extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); -extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, - ptl_md_t *oldumd, ptl_md_t *newumd, - ptl_handle_eq_t *testqh); - -extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, ptl_size_t offset); -extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_ack_req_t ack, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, - ptl_size_t offset, ptl_hdr_data_t hdr_data); -extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); - -#endif diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h deleted file 
mode 100644 index cfcef2b..0000000 --- a/lustre/portals/include/portals/lib-types.h +++ /dev/null @@ -1,359 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * p30/lib-types.h - * - * Types used by the library side routines that do not need to be - * exposed to the user application - */ - -#ifndef _LIB_TYPES_H_ -#define _LIB_TYPES_H_ - -#include "build_check.h" - -#include -#include -#ifdef __KERNEL__ -# include -# include -# include -#else -# define PTL_USE_LIB_FREELIST -# include -#endif - -typedef char *user_ptr; -typedef struct lib_msg_t lib_msg_t; -typedef struct lib_ptl_t lib_ptl_t; -typedef struct lib_ac_t lib_ac_t; -typedef struct lib_me_t lib_me_t; -typedef struct lib_md_t lib_md_t; -typedef struct lib_eq_t lib_eq_t; - -#define WIRE_ATTR __attribute__((packed)) - -/* The wire handle's interface cookie only matches one network interface in - * one epoch (i.e. new cookie when the interface restarts or the node - * reboots). The object cookie only matches one object on that interface - * during that object's lifetime (i.e. no cookie re-use). */ -typedef struct { - __u64 wh_interface_cookie; - __u64 wh_object_cookie; -} WIRE_ATTR ptl_handle_wire_t; - -/* byte-flip insensitive! */ -#define PTL_WIRE_HANDLE_NONE \ -((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) - -typedef enum { - PTL_MSG_ACK = 0, - PTL_MSG_PUT, - PTL_MSG_GET, - PTL_MSG_REPLY, - PTL_MSG_HELLO, -} ptl_msg_type_t; - -/* The variant fields of the portals message header are aligned on an 8 - * byte boundary in the message header. Note that all types used in these - * wire structs MUST be fixed size and the smaller types are placed at the - * end. 
*/ -typedef struct ptl_ack { - ptl_handle_wire_t dst_wmd; - ptl_match_bits_t match_bits; - ptl_size_t mlength; -} WIRE_ATTR ptl_ack_t; - -typedef struct ptl_put { - ptl_handle_wire_t ack_wmd; - ptl_match_bits_t match_bits; - ptl_hdr_data_t hdr_data; - ptl_pt_index_t ptl_index; - ptl_size_t offset; -} WIRE_ATTR ptl_put_t; - -typedef struct ptl_get { - ptl_handle_wire_t return_wmd; - ptl_match_bits_t match_bits; - ptl_pt_index_t ptl_index; - ptl_size_t src_offset; - ptl_size_t sink_length; -} WIRE_ATTR ptl_get_t; - -typedef struct ptl_reply { - ptl_handle_wire_t dst_wmd; -} WIRE_ATTR ptl_reply_t; - -typedef struct ptl_hello { - __u64 incarnation; - __u32 type; -} WIRE_ATTR ptl_hello_t; - -typedef struct { - ptl_nid_t dest_nid; - ptl_nid_t src_nid; - ptl_pid_t dest_pid; - ptl_pid_t src_pid; - __u32 type; /* ptl_msg_type_t */ - __u32 payload_length; /* payload data to follow */ - /*<------__u64 aligned------->*/ - union { - ptl_ack_t ack; - ptl_put_t put; - ptl_get_t get; - ptl_reply_t reply; - ptl_hello_t hello; - } msg; -} WIRE_ATTR ptl_hdr_t; - -/* A HELLO message contains the portals magic number and protocol version - * code in the header's dest_nid, the peer's NID in the src_nid, and - * PTL_MSG_HELLO in the type field. All other common fields are zero - * (including payload_size; i.e. no payload). - * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is - * running the same protocol and to find out its NID, so that hosts with - * multiple IP interfaces can have a single NID. These NALs should exchange - * HELLO messages when a connection is first established. - * Individual NALs can put whatever else they fancy in ptl_hdr_t::msg. 
- */ -typedef struct { - __u32 magic; /* PORTALS_PROTO_MAGIC */ - __u16 version_major; /* increment on incompatible change */ - __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR ptl_magicversion_t; - -#define PORTALS_PROTO_MAGIC 0xeebc0ded - -#define PORTALS_PROTO_VERSION_MAJOR 1 -#define PORTALS_PROTO_VERSION_MINOR 0 - -typedef struct { - long recv_count, recv_length, send_count, send_length, drop_count, - drop_length, msgs_alloc, msgs_max; -} lib_counters_t; - -/* temporary expedient: limit number of entries in discontiguous MDs */ -#define PTL_MTU (1<<20) -#define PTL_MD_MAX_IOV 256 - -struct lib_msg_t { - struct list_head msg_list; - lib_md_t *md; - ptl_handle_wire_t ack_wmd; - ptl_event_t ev; -}; - -struct lib_ptl_t { - ptl_pt_index_t size; - struct list_head *tbl; -}; - -struct lib_ac_t { - int next_free; -}; - -typedef struct { - struct list_head lh_hash_chain; - __u64 lh_cookie; -} lib_handle_t; - -#define lh_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -struct lib_eq_t { - struct list_head eq_list; - lib_handle_t eq_lh; - ptl_seq_t eq_enq_seq; - ptl_seq_t eq_deq_seq; - ptl_size_t eq_size; - ptl_event_t *eq_events; - int eq_refcount; - ptl_eq_handler_t eq_callback; - void *eq_addrkey; -}; - -struct lib_me_t { - struct list_head me_list; - lib_handle_t me_lh; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits, ignore_bits; - ptl_unlink_t unlink; - lib_md_t *md; -}; - -struct lib_md_t { - struct list_head md_list; - lib_handle_t md_lh; - lib_me_t *me; - user_ptr start; - ptl_size_t offset; - ptl_size_t length; - ptl_size_t max_size; - int threshold; - int pending; - unsigned int options; - unsigned int md_flags; - void *user_ptr; - lib_eq_t *eq; - void *md_addrkey; - unsigned int md_niov; /* # frags */ - union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; - } md_iov; -}; - -#define PTL_MD_FLAG_ZOMBIE (1 << 0) -#define PTL_MD_FLAG_AUTO_UNLINK (1 << 1) - 
-static inline int lib_md_exhausted (lib_md_t *md) -{ - return (md->threshold == 0 || - ((md->options & PTL_MD_MAX_SIZE) != 0 && - md->offset + md->max_size > md->length)); -} - -#ifdef PTL_USE_LIB_FREELIST -typedef struct -{ - void *fl_objs; /* single contiguous array of objects */ - int fl_nobjs; /* the number of them */ - int fl_objsize; /* the size (including overhead) of each of them */ - struct list_head fl_list; /* where they are enqueued */ -} lib_freelist_t; - -typedef struct -{ - struct list_head fo_list; /* enqueue on fl_list */ - void *fo_contents; /* aligned contents */ -} lib_freeobj_t; -#endif - -typedef struct { - /* info about peers we are trying to fail */ - struct list_head tp_list; /* stash in ni.ni_test_peers */ - ptl_nid_t tp_nid; /* matching nid */ - unsigned int tp_threshold; /* # failures to simulate */ -} lib_test_peer_t; - -#define PTL_COOKIE_TYPE_MD 1 -#define PTL_COOKIE_TYPE_ME 2 -#define PTL_COOKIE_TYPE_EQ 3 -#define PTL_COOKIE_TYPES 4 -/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be - * extracted by masking with (PTL_COOKIE_TYPES - 1) */ - -typedef struct lib_ni -{ - nal_t *ni_api; - ptl_process_id_t ni_pid; - lib_ptl_t ni_portals; - lib_counters_t ni_counters; - ptl_ni_limits_t ni_actual_limits; - - int ni_lh_hash_size; /* size of lib handle hash table */ - struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ - __u64 ni_next_object_cookie; /* cookie generator */ - __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ - - struct list_head ni_test_peers; - -#ifdef PTL_USE_LIB_FREELIST - lib_freelist_t ni_free_mes; - lib_freelist_t ni_free_msgs; - lib_freelist_t ni_free_mds; - lib_freelist_t ni_free_eqs; -#endif - - struct list_head ni_active_msgs; - struct list_head ni_active_mds; - struct list_head ni_active_eqs; - -#ifdef __KERNEL__ - spinlock_t ni_lock; - wait_queue_head_t ni_waitq; -#else - pthread_mutex_t ni_mutex; - pthread_cond_t ni_cond; -#endif -} lib_ni_t; - 
- -typedef struct lib_nal -{ - /* lib-level interface state */ - lib_ni_t libnal_ni; - - /* NAL-private data */ - void *libnal_data; - - /* - * send: Sends a preformatted header and payload data to a - * specified remote process. The payload is scattered over 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to send and will call - * lib_finalize on completion - */ - ptl_err_t (*libnal_send) - (struct lib_nal *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen); - - /* as send, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*libnal_send_pages) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen); - /* - * recv: Receives an incoming message from a remote process. The - * payload is to be received into the scattered buffer of 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. Payload bytes after 'mlen' up to 'rlen' are to be - * discarded. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to receive and will call - * lib_finalize on completion - */ - ptl_err_t (*libnal_recv) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); - - /* as recv, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*libnal_recv_pages) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen, size_t rlen); - - /* - * (un)map: Tell the NAL about some memory it will access. - * *addrkey passed to libnal_unmap() is what libnal_map() set it to. 
- * type of *iov depends on options. - * Set to NULL if not required. - */ - ptl_err_t (*libnal_map) - (struct lib_nal *nal, unsigned int niov, struct iovec *iov, - void **addrkey); - void (*libnal_unmap) - (struct lib_nal *nal, unsigned int niov, struct iovec *iov, - void **addrkey); - - /* as (un)map, but with a set of page fragments */ - ptl_err_t (*libnal_map_pages) - (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - void (*libnal_unmap_pages) - (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - - void (*libnal_printf)(struct lib_nal *nal, const char *fmt, ...); - - /* Calculate a network "distance" to given node */ - int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist); -} lib_nal_t; - -#endif diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h deleted file mode 100644 index 37d9952..0000000 --- a/lustre/portals/include/portals/list.h +++ /dev/null @@ -1,243 +0,0 @@ -#ifndef _LINUX_LIST_H -/* - * Simple doubly linked list implementation. - * - * Some of the internal functions ("__xxx") are useful when - * manipulating whole lists rather than single entries, as - * sometimes we already know the next/prev entries and we can - * generate better code by using them directly rather than - * using the generic single-entry routines. - */ - -struct list_head { - struct list_head *next, *prev; -}; - -typedef struct list_head list_t; - -#define LIST_HEAD_INIT(name) { &(name), &(name) } - -#define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) - -#define INIT_LIST_HEAD(ptr) do { \ - (ptr)->next = (ptr); (ptr)->prev = (ptr); \ -} while (0) - -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! 
- */ -static inline void __list_add(struct list_head * new, - struct list_head * prev, - struct list_head * next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - -/** - * list_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. - */ -static inline void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} - -/** - * list_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static inline void list_add_tail(struct list_head *new, struct list_head *head) -{ - __list_add(new, head->prev, head); -} - -/* - * Delete a list entry by making the prev/next entries - * point to each other. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static inline void __list_del(struct list_head * prev, struct list_head * next) -{ - next->prev = prev; - prev->next = next; -} - -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is in an undefined state. - */ -static inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); -} - -/** - * list_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. 
- */ -static inline void list_del_init(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} -#endif - -#ifndef list_for_each_entry -/** - * list_move - delete from one list and add as another's head - * @list: the entry to move - * @head: the head that will precede our entry - */ -static inline void list_move(struct list_head *list, struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add(list, head); -} - -/** - * list_move_tail - delete from one list and add as another's tail - * @list: the entry to move - * @head: the head that will follow our entry - */ -static inline void list_move_tail(struct list_head *list, - struct list_head *head) -{ - __list_del(list->prev, list->next); - list_add_tail(list, head); -} -#endif - -#ifndef _LINUX_LIST_H -#define _LINUX_LIST_H -/** - * list_empty - tests whether a list is empty - * @head: the list to test. - */ -static inline int list_empty(struct list_head *head) -{ - return head->next == head; -} - -static inline void __list_splice(struct list_head *list, - struct list_head *head) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; -} - -/** - * list_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static inline void list_splice(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head); -} - -/** - * list_splice_init - join two lists and reinitialise the emptied list. - * @list: the new list to add. - * @head: the place to add it in the first list. 
- * - * The list at @list is reinitialised - */ -static inline void list_splice_init(struct list_head *list, - struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head); - INIT_LIST_HEAD(list); - } -} - -/** - * list_entry - get the struct for this entry - * @ptr: the &struct list_head pointer. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_struct within the struct. - */ -#define list_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) - -/** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. - */ -#define list_for_each(pos, head) \ - for (pos = (head)->next ; pos != (head); pos = pos->next ) - -/** - * list_for_each_prev - iterate over a list in reverse order - * @pos: the &struct list_head to use as a loop counter. - * @head: the head for your list. - */ -#define list_for_each_prev(pos, head) \ - for (pos = (head)->prev ; pos != (head); pos = pos->prev) - -/** - * list_for_each_safe - iterate over a list safe against removal of list entry - * @pos: the &struct list_head to use as a loop counter. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. - */ -#define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) - -#endif - -#ifndef list_for_each_entry -/** - * list_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop counter. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. 
- */ -#define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) -#endif - -#ifndef list_for_each_entry_safe -/** - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop counter. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) -#endif diff --git a/lustre/portals/include/portals/lltrace.h b/lustre/portals/include/portals/lltrace.h deleted file mode 100644 index 3e01df1..0000000 --- a/lustre/portals/include/portals/lltrace.h +++ /dev/null @@ -1,173 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Compile with: - * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl - */ -#ifndef __LTRACE_H_ -#define __LTRACE_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static inline int ltrace_write_file(char* fname) -{ - char* argv[3]; - - argv[0] = "debug_kernel"; - argv[1] = fname; - argv[2] = "1"; - - fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]); - - return jt_dbg_debug_kernel(3, argv); -} - -static inline int ltrace_clear() -{ - char* argv[1]; - - argv[0] = "clear"; - - fprintf(stderr, "[ptlctl] %s\n", argv[0]); - - return jt_dbg_clear_debug_buf(1, argv); -} - -static inline int ltrace_mark(int indent_level, char* text) -{ - char* argv[2]; - char 
mark_buf[PATH_MAX]; - - snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text); - - argv[0] = "mark"; - argv[1] = mark_buf; - return jt_dbg_mark_debug_buf(2, argv); -} - -static inline int ltrace_applymasks() -{ - char* argv[2]; - argv[0] = "list"; - argv[1] = "applymasks"; - - fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]); - - return jt_dbg_list(2, argv); -} - - -static inline int ltrace_filter(char* subsys_or_mask) -{ - char* argv[2]; - argv[0] = "filter"; - argv[1] = subsys_or_mask; - return jt_dbg_filter(2, argv); -} - -static inline int ltrace_show(char* subsys_or_mask) -{ - char* argv[2]; - argv[0] = "show"; - argv[1] = subsys_or_mask; - return jt_dbg_show(2, argv); -} - -static inline int ltrace_start() -{ - int rc = 0; - dbg_initialize(0, NULL); -#ifdef PORTALS_DEV_ID - rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); -#endif - ltrace_filter("class"); - ltrace_filter("nal"); - ltrace_filter("portals"); - - ltrace_show("all_types"); - ltrace_filter("trace"); - ltrace_filter("malloc"); - ltrace_filter("net"); - ltrace_filter("page"); - ltrace_filter("other"); - ltrace_filter("info"); - ltrace_applymasks(); - - return rc; -} - - -static inline void ltrace_stop() -{ -#ifdef PORTALS_DEV_ID - unregister_ioc_dev(PORTALS_DEV_ID); -#endif -} - -static inline int not_uml() -{ - /* Return Values: - * 0 when run under UML - * 1 when run on host - * <0 when lookup failed - */ - struct stat buf; - int rc = stat("/dev/ubd", &buf); - rc = ((rc<0) && (errno == ENOENT)) ? 
1 : rc; - if (rc<0) { - fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno)); - rc = 1; /* Assume host */ - } - return rc; -} - -#define LTRACE_MAX_NOB 256 -static inline void ltrace_add_processnames(char* fname) -{ - char cmdbuf[LTRACE_MAX_NOB]; - struct timeval tv; - struct timezone tz; - int nob; - int underuml = !not_uml(); - - gettimeofday(&tv, &tz); - - nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \""); - - /* Careful - these format strings need to match the CDEBUG - * formats in portals/linux/debug.c EXACTLY - */ - nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, "%02x:%06x:%d:%lu.%06lu ", - S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec); - - if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) { - nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB, - "(%s:%d:%s() %d | %d+%lu): ", - "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L); - } - else { - nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB, - "(%s:%d:%s() %d+%lu): ", - "lltrace.h", __LINE__, __FUNCTION__, 0, 0L); - } - - nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, " %%p %%c\" >> %s", fname); - system(cmdbuf); -} - -#endif diff --git a/lustre/portals/include/portals/myrnal.h b/lustre/portals/include/portals/myrnal.h deleted file mode 100644 index 13790f7..0000000 --- a/lustre/portals/include/portals/myrnal.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef MYRNAL_H -#define MYRNAL_H - -#define MAX_ARGS_LEN (256) -#define MAX_RET_LEN (128) -#define MYRNAL_MAX_ACL_SIZE (64) -#define MYRNAL_MAX_PTL_SIZE (64) - -#define P3CMD (100) -#define P3SYSCALL (200) -#define P3REGISTER (300) - -enum { PTL_MLOCKALL }; - -typedef struct { - void *args; - size_t args_len; - void *ret; - size_t ret_len; - int p3cmd; -} myrnal_forward_t; - -#endif /* MYRNAL_H */ diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h deleted file mode 100644 index bf86569..0000000 --- a/lustre/portals/include/portals/nal.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef _NAL_H_ -#define _NAL_H_ - 
-#include "build_check.h" - -/* - * p30/nal.h - * - * The API side NAL declarations - */ - -#include - -typedef struct nal_t nal_t; - -struct nal_t { - /* common interface state */ - int nal_refct; - ptl_handle_ni_t nal_handle; - - /* NAL-private data */ - void *nal_data; - - /* NAL API implementation - * NB only nal_ni_init needs to be set when the NAL registers itself */ - int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *req, ptl_ni_limits_t *actual); - - void (*nal_ni_fini) (nal_t *nal); - - int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id); - int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status); - int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance); - int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold); - - int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); - int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); - int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me); - - int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me, - ptl_md_t *md, ptl_unlink_t unlink, - ptl_handle_md_t *handle); - int (*nal_md_bind) (nal_t *nal, - ptl_md_t *md, ptl_unlink_t unlink, - ptl_handle_md_t *handle); - int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md); - int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md, - ptl_md_t *old_md, ptl_md_t *new_md, - ptl_handle_eq_t *testq); - - int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count, - ptl_eq_handler_t handler, - ptl_handle_eq_t *handle); - int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq); - int (*nal_eq_poll) (nal_t *nal, - ptl_handle_eq_t *eqs, int neqs, int timeout, - ptl_event_t *event, int 
*which); - - int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index, - ptl_process_id_t match_id, ptl_pt_index_t portal); - - int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack, - ptl_process_id_t *target, ptl_pt_index_t portal, - ptl_ac_index_t ac, ptl_match_bits_t match, - ptl_size_t offset, ptl_hdr_data_t hdr_data); - int (*nal_get) (nal_t *nal, ptl_handle_md_t *md, - ptl_process_id_t *target, ptl_pt_index_t portal, - ptl_ac_index_t ac, ptl_match_bits_t match, - ptl_size_t offset); -}; - -extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any); - -#ifdef __KERNEL__ -extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal); -extern void ptl_unregister_nal(ptl_interface_t interface); -#endif - -#endif diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h deleted file mode 100644 index 55a991b..0000000 --- a/lustre/portals/include/portals/nalids.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "build_check.h" - diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h deleted file mode 100644 index 4b8631d..0000000 --- a/lustre/portals/include/portals/p30.h +++ /dev/null @@ -1,26 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef _P30_H_ -#define _P30_H_ - -#include "build_check.h" - -/* - * p30.h - * - * User application interface file - */ - -#if defined (__KERNEL__) -#include -#include -#else -#include -#include -#endif - -#include -#include - -#endif diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h deleted file mode 100644 index ef52a25..0000000 --- a/lustre/portals/include/portals/ptlctl.h +++ /dev/null @@ -1,96 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * header for libptlctl.a - */ -#ifndef _PTLCTL_H_ -#define _PTLCTL_H_ - -#include -#include -#include - -#define PORTALS_DEV_ID 0 -#define PORTALS_DEV_PATH "/dev/portals" -#define OBD_DEV_ID 1 -#define OBD_DEV_PATH "/dev/obd" -#define SMFS_DEV_ID 2 -#define SMFS_DEV_PATH "/dev/snapdev" - -int ptl_name2nal(char *str); -int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); -int ptl_parse_nid (ptl_nid_t *nidp, char *str); -char * ptl_nid2str (char *buffer, ptl_nid_t nid); - -int ptl_initialize(int argc, char **argv); -int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_interfaces(int argc, char **argv); -int jt_ptl_add_interface(int argc, char **argv); -int jt_ptl_del_interface(int argc, char **argv); -int jt_ptl_print_peers (int argc, char **argv); -int jt_ptl_add_peer (int argc, char **argv); -int jt_ptl_del_peer (int argc, char **argv); -int jt_ptl_print_connections (int argc, char **argv); -int jt_ptl_connect(int argc, char **argv); -int jt_ptl_disconnect(int argc, char **argv); -int jt_ptl_push_connection(int argc, char **argv); -int jt_ptl_print_active_txs(int argc, char **argv); -int jt_ptl_ping(int argc, char **argv); -int jt_ptl_shownid(int argc, char **argv); -int jt_ptl_mynid(int argc, char **argv); -int jt_ptl_add_uuid(int argc, char **argv); -int 
jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ -int jt_ptl_close_uuid(int argc, char **argv); -int jt_ptl_del_uuid(int argc, char **argv); -int jt_ptl_add_route (int argc, char **argv); -int jt_ptl_del_route (int argc, char **argv); -int jt_ptl_notify_router (int argc, char **argv); -int jt_ptl_print_routes (int argc, char **argv); -int jt_ptl_fail_nid (int argc, char **argv); -int jt_ptl_lwt(int argc, char **argv); -int jt_ptl_memhog(int argc, char **argv); - -int dbg_initialize(int argc, char **argv); -int jt_dbg_filter(int argc, char **argv); -int jt_dbg_show(int argc, char **argv); -int jt_dbg_list(int argc, char **argv); -int jt_dbg_debug_kernel(int argc, char **argv); -int jt_dbg_debug_daemon(int argc, char **argv); -int jt_dbg_debug_file(int argc, char **argv); -int jt_dbg_clear_debug_buf(int argc, char **argv); -int jt_dbg_mark_debug_buf(int argc, char **argv); -int jt_dbg_modules(int argc, char **argv); -int jt_dbg_panic(int argc, char **argv); - -int ptl_set_cfg_record_cb(cfg_record_cb_t cb); - -/* l_ioctl.c */ -typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf); -void set_ioc_handler(ioc_handler_t *handler); -int register_ioc_dev(int dev_id, const char * dev_name); -void unregister_ioc_dev(int dev_id); -int set_ioctl_dump(char * file); -int l_ioctl(int dev_id, unsigned int opc, void *buf); -int parse_dump(char * dump_file, ioc_handler_t ioc_func); -int jt_ioc_dump(int argc, char **argv); -extern char *dump_filename; -int dump(int dev_id, unsigned int opc, void *buf); - -#endif diff --git a/lustre/portals/include/portals/socknal.h b/lustre/portals/include/portals/socknal.h deleted file mode 100644 index 27e6f8e..0000000 --- a/lustre/portals/include/portals/socknal.h +++ /dev/null @@ -1,14 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * - * - * #defines shared between socknal implementation and utilities - */ - -#define SOCKNAL_CONN_NONE 
(-1) -#define SOCKNAL_CONN_ANY 0 -#define SOCKNAL_CONN_CONTROL 1 -#define SOCKNAL_CONN_BULK_IN 2 -#define SOCKNAL_CONN_BULK_OUT 3 -#define SOCKNAL_CONN_NTYPES 4 diff --git a/lustre/portals/include/portals/stringtab.h b/lustre/portals/include/portals/stringtab.h deleted file mode 100644 index 33e4375..0000000 --- a/lustre/portals/include/portals/stringtab.h +++ /dev/null @@ -1,3 +0,0 @@ -/* - * stringtab.h - */ diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h deleted file mode 100644 index 0bada40..0000000 --- a/lustre/portals/include/portals/types.h +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef _P30_TYPES_H_ -#define _P30_TYPES_H_ - -#include "build_check.h" - -#include -#include - -/* This implementation uses the same type for API function return codes and - * the completion status in an event */ -#define PTL_NI_OK PTL_OK -typedef ptl_err_t ptl_ni_fail_t; - -typedef __u32 ptl_uid_t; -typedef __u32 ptl_jid_t; -typedef __u64 ptl_nid_t; -typedef __u32 ptl_pid_t; -typedef __u32 ptl_pt_index_t; -typedef __u32 ptl_ac_index_t; -typedef __u64 ptl_match_bits_t; -typedef __u64 ptl_hdr_data_t; -typedef __u32 ptl_size_t; - -#define PTL_TIME_FOREVER (-1) - -typedef struct { - unsigned long nal_idx; /* which network interface */ - __u64 cookie; /* which thing on that interface */ -} ptl_handle_any_t; - -typedef ptl_handle_any_t ptl_handle_ni_t; -typedef ptl_handle_any_t ptl_handle_eq_t; -typedef ptl_handle_any_t ptl_handle_md_t; -typedef ptl_handle_any_t ptl_handle_me_t; - -#define PTL_INVALID_HANDLE \ - ((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) -#define PTL_EQ_NONE PTL_INVALID_HANDLE - -static inline int PtlHandleIsEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) -{ - return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); -} - -#define PTL_UID_ANY ((ptl_uid_t) -1) -#define PTL_JID_ANY ((ptl_jid_t) -1) -#define PTL_NID_ANY ((ptl_nid_t) -1) -#define PTL_PID_ANY ((ptl_pid_t) -1) - -typedef struct { - ptl_nid_t nid; - 
ptl_pid_t pid; /* node id / process id */ -} ptl_process_id_t; - -typedef enum { - PTL_RETAIN = 0, - PTL_UNLINK -} ptl_unlink_t; - -typedef enum { - PTL_INS_BEFORE, - PTL_INS_AFTER -} ptl_ins_pos_t; - -typedef struct { - void *start; - ptl_size_t length; - int threshold; - int max_size; - unsigned int options; - void *user_ptr; - ptl_handle_eq_t eq_handle; -} ptl_md_t; - -/* Options for the MD structure */ -#define PTL_MD_OP_PUT (1 << 0) -#define PTL_MD_OP_GET (1 << 1) -#define PTL_MD_MANAGE_REMOTE (1 << 2) -/* unused (1 << 3) */ -#define PTL_MD_TRUNCATE (1 << 4) -#define PTL_MD_ACK_DISABLE (1 << 5) -#define PTL_MD_IOVEC (1 << 6) -#define PTL_MD_MAX_SIZE (1 << 7) -#define PTL_MD_KIOV (1 << 8) -#define PTL_MD_EVENT_START_DISABLE (1 << 9) -#define PTL_MD_EVENT_END_DISABLE (1 << 10) - -/* For compatibility with Cray Portals */ -#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0 -#define PTL_MD_PHYS 0 - -#define PTL_MD_THRESH_INF (-1) - -/* NB lustre portals uses struct iovec internally! */ -typedef struct iovec ptl_md_iovec_t; - -typedef struct { - struct page *kiov_page; - unsigned int kiov_len; - unsigned int kiov_offset; -} ptl_kiov_t; - -typedef enum { - PTL_EVENT_GET_START, - PTL_EVENT_GET_END, - - PTL_EVENT_PUT_START, - PTL_EVENT_PUT_END, - - PTL_EVENT_REPLY_START, - PTL_EVENT_REPLY_END, - - PTL_EVENT_ACK, - - PTL_EVENT_SEND_START, - PTL_EVENT_SEND_END, - - PTL_EVENT_UNLINK, -} ptl_event_kind_t; - -#define PTL_SEQ_BASETYPE long -typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; -#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) - -/* XXX - * cygwin need the pragma line, not clear if it's needed in other places. - * checking!!! 
- */ -#ifdef __CYGWIN__ -#pragma pack(push, 4) -#endif -typedef struct { - ptl_event_kind_t type; - ptl_process_id_t initiator; - ptl_uid_t uid; - ptl_jid_t jid; - ptl_pt_index_t pt_index; - ptl_match_bits_t match_bits; - ptl_size_t rlength; - ptl_size_t mlength; - ptl_size_t offset; - ptl_handle_md_t md_handle; - ptl_md_t md; - ptl_hdr_data_t hdr_data; - ptl_seq_t link; - ptl_ni_fail_t ni_fail_type; - - int unlinked; - - volatile ptl_seq_t sequence; -} ptl_event_t; -#ifdef __CYGWIN__ -#pragma pop -#endif - -typedef enum { - PTL_ACK_REQ, - PTL_NOACK_REQ -} ptl_ack_req_t; - -typedef void (*ptl_eq_handler_t)(ptl_event_t *event); -#define PTL_EQ_HANDLER_NONE NULL - -typedef struct { - int max_mes; - int max_mds; - int max_eqs; - int max_ac_index; - int max_pt_index; - int max_md_iovecs; - int max_me_list; - int max_getput_md; -} ptl_ni_limits_t; - -/* - * Status registers - */ -typedef enum { - PTL_SR_DROP_COUNT, - PTL_SR_DROP_LENGTH, - PTL_SR_RECV_COUNT, - PTL_SR_RECV_LENGTH, - PTL_SR_SEND_COUNT, - PTL_SR_SEND_LENGTH, - PTL_SR_MSGS_MAX, -} ptl_sr_index_t; - -typedef int ptl_sr_value_t; - -typedef int ptl_interface_t; -#define PTL_IFACE_DEFAULT (-1) - -#endif diff --git a/lustre/portals/knals/.cvsignore b/lustre/portals/knals/.cvsignore deleted file mode 100644 index f5fd0b0..0000000 --- a/lustre/portals/knals/.cvsignore +++ /dev/null @@ -1,5 +0,0 @@ -Makefile -autoMakefile -autoMakefile.in -.*.cmd -.depend diff --git a/lustre/portals/knals/Makefile.in b/lustre/portals/knals/Makefile.in deleted file mode 100644 index 7e2e601..0000000 --- a/lustre/portals/knals/Makefile.in +++ /dev/null @@ -1,9 +0,0 @@ -@BUILD_GMNAL_TRUE@subdir-m += gmnal -@BUILD_RANAL_TRUE@subdir-m += ranal -@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal -@BUILD_IIBNAL_TRUE@subdir-m += iibnal -@BUILD_QSWNAL_TRUE@subdir-m += qswnal -subdir-m += socknal -subdir-m += lonal - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/Makefile.mk b/lustre/portals/knals/Makefile.mk deleted file mode 100644 index 
454ee16..0000000 --- a/lustre/portals/knals/Makefile.mk +++ /dev/null @@ -1,5 +0,0 @@ -include $(obj)/../Kernelenv - -obj-y = socknal/ -obj-y = lonal/ -# more coming... diff --git a/lustre/portals/knals/autoMakefile.am b/lustre/portals/knals/autoMakefile.am deleted file mode 100644 index 4638188..0000000 --- a/lustre/portals/knals/autoMakefile.am +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -SUBDIRS = gmnal iibnal openibnal qswnal socknal lonal ranal diff --git a/lustre/portals/knals/gmnal/.cvsignore b/lustre/portals/knals/gmnal/.cvsignore deleted file mode 100644 index 642e2e6..0000000 --- a/lustre/portals/knals/gmnal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.cmd -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/gmnal/Makefile.in b/lustre/portals/knals/gmnal/Makefile.in deleted file mode 100644 index 89ea361..0000000 --- a/lustre/portals/knals/gmnal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kgmnal -kgmnal-objs := gmnal_api.o gmnal_cb.o gmnal_comm.o gmnal_utils.o gmnal_module.o - -EXTRA_PRE_CFLAGS := @GMCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/gmnal/Makefile.mk b/lustre/portals/knals/gmnal/Makefile.mk deleted file mode 100644 index b799a47..0000000 --- a/lustre/portals/knals/gmnal/Makefile.mk +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -include ../../Kernelenv - -obj-y += gmnal.o -gmnal-objs := gmnal_api.o gmnal_cb.o gmnal_utils.o gmnal_comm.o gmnal_module.o - diff --git a/lustre/portals/knals/gmnal/autoMakefile.am b/lustre/portals/knals/gmnal/autoMakefile.am deleted file mode 100644 index d8b9edb..0000000 --- a/lustre/portals/knals/gmnal/autoMakefile.am +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -if BUILD_GMNAL -if !CRAY_PORTALS -modulenet_DATA = kgmnal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kgmnal-objs:%.o=%.c) gmnal.h diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h deleted file mode 100644 index f45eab7..0000000 --- a/lustre/portals/knals/gmnal/gmnal.h +++ /dev/null @@ -1,455 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - - -/* - * Portals GM kernel NAL header file - * This file makes all declaration and prototypes - * for the API side and CB side of the NAL - */ -#ifndef __INCLUDE_GMNAL_H__ -#define __INCLUDE_GMNAL_H__ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include "linux/config.h" -#include "linux/module.h" -#include "linux/tty.h" -#include "linux/kernel.h" -#include "linux/mm.h" -#include "linux/string.h" -#include "linux/stat.h" -#include "linux/errno.h" -#include "linux/locks.h" -#include "linux/unistd.h" -#include "linux/init.h" -#include "linux/sem.h" -#include "linux/vmalloc.h" -#include "linux/sysctl.h" - -#define DEBUG_SUBSYSTEM S_NAL - -#include "portals/nal.h" -#include "portals/api.h" -#include "portals/errno.h" -#include "linux/kp30.h" -#include "portals/p30.h" - -#include "portals/nal.h" -#include "portals/lib-p30.h" - -#define GM_STRONG_TYPES 1 -#ifdef VERSION -#undef VERSION -#endif -#include "gm.h" -#include "gm_internal.h" - - - -/* - * Defines for the API NAL - */ - -/* - * Small message size is configurable - * insmod can set small_msg_size - * which is used to populate nal_data.small_msg_size - */ -#define GMNAL_SMALL_MESSAGE 1078 -#define GMNAL_LARGE_MESSAGE_INIT 1079 -#define GMNAL_LARGE_MESSAGE_ACK 1080 -#define GMNAL_LARGE_MESSAGE_FINI 1081 - -extern int gmnal_small_msg_size; -extern int num_rx_threads; -extern int num_stxds; -extern int gm_port; -#define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size -#define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c) -#define GMNAL_MAGIC 0x1234abcd -/* - * The gm_port to use for gmnal - */ -#define GMNAL_GM_PORT gm_port - - -/* - * Small Transmit Descriptor - * A structre to keep track of a small transmit operation - * This structure has a one-to-one relationship with a small - * transmit buffer (both create by gmnal_stxd_alloc). - * There are two free list of stxd. One for use by clients of the NAL - * and the other by the NAL rxthreads when doing sends. 
- * This helps prevent deadlock caused by stxd starvation. - */ -typedef struct _gmnal_stxd_t { - void *buffer; - int buffer_size; - gm_size_t gm_size; - int msg_size; - int gm_target_node; - int gm_priority; - int type; - struct _gmnal_data_t *nal_data; - lib_msg_t *cookie; - int niov; - struct iovec iov[PTL_MD_MAX_IOV]; - struct _gmnal_stxd_t *next; - int rxt; - int kniov; - struct iovec *iovec_dup; -} gmnal_stxd_t; - -/* - * keeps a transmit token for large transmit (gm_get) - * and a pointer to rxd that is used as context for large receive - */ -typedef struct _gmnal_ltxd_t { - struct _gmnal_ltxd_t *next; - struct _gmnal_srxd_t *srxd; -} gmnal_ltxd_t; - - -/* - * as for gmnal_stxd_t - * a hash table in nal_data find srxds from - * the rx buffer address. hash table populated at init time - */ -typedef struct _gmnal_srxd_t { - void *buffer; - int size; - gm_size_t gmsize; - unsigned int gm_source_node; - gmnal_stxd_t *source_stxd; - int type; - int nsiov; - int nriov; - struct iovec *riov; - int ncallbacks; - spinlock_t callback_lock; - int callback_status; - lib_msg_t *cookie; - struct _gmnal_srxd_t *next; - struct _gmnal_data_t *nal_data; -} gmnal_srxd_t; - -/* - * Header which lmgnal puts at the start of each message - */ -typedef struct _gmnal_msghdr { - int magic; - int type; - unsigned int sender_node_id; - gmnal_stxd_t *stxd; - int niov; - } gmnal_msghdr_t; -#define GMNAL_MSGHDR_SIZE sizeof(gmnal_msghdr_t) - -/* - * the caretaker thread (ct_thread) gets receive events - * (and other events) from the myrinet device via the GM2 API. - * caretaker thread populates one work entry for each receive event, - * puts it on a Q in nal_data and wakes a receive thread to - * process the receive. - * Processing a portals receive can involve a transmit operation. 
- * Because of this the caretaker thread cannot process receives - * as it may get deadlocked when supply of transmit descriptors - * is exhausted (as caretaker thread is responsible for replacing - * transmit descriptors on the free list) - */ -typedef struct _gmnal_rxtwe { - void *buffer; - unsigned snode; - unsigned sport; - unsigned type; - unsigned length; - struct _gmnal_rxtwe *next; -} gmnal_rxtwe_t; - -/* - * 1 receive thread started on each CPU - */ -#define NRXTHREADS 10 /* max number of receiver threads */ - -typedef struct _gmnal_data_t { - spinlock_t stxd_lock; - struct semaphore stxd_token; - gmnal_stxd_t *stxd; - spinlock_t rxt_stxd_lock; - struct semaphore rxt_stxd_token; - gmnal_stxd_t *rxt_stxd; - spinlock_t ltxd_lock; - struct semaphore ltxd_token; - gmnal_ltxd_t *ltxd; - spinlock_t srxd_lock; - struct semaphore srxd_token; - gmnal_srxd_t *srxd; - struct gm_hash *srxd_hash; - nal_t *nal; - lib_nal_t *libnal; - struct gm_port *gm_port; - unsigned int gm_local_nid; - unsigned int gm_global_nid; - spinlock_t gm_lock; - long rxthread_pid[NRXTHREADS]; - int rxthread_stop_flag; - spinlock_t rxthread_flag_lock; - long rxthread_flag; - long ctthread_pid; - int ctthread_flag; - gm_alarm_t ctthread_alarm; - int small_msg_size; - int small_msg_gmsize; - gmnal_rxtwe_t *rxtwe_head; - gmnal_rxtwe_t *rxtwe_tail; - spinlock_t rxtwe_lock; - struct semaphore rxtwe_wait; - struct ctl_table_header *sysctl; -} gmnal_data_t; - -/* - * Flags to start/stop and check status of threads - * each rxthread sets 1 bit (any bit) of the flag on startup - * and clears 1 bit when exiting - */ -#define GMNAL_THREAD_RESET 0 -#define GMNAL_THREAD_STOP 666 -#define GMNAL_CTTHREAD_STARTED 333 -#define GMNAL_RXTHREADS_STARTED ( (1<stxd_lock); -#define GMNAL_TXD_LOCK(a) spin_lock(&a->stxd_lock); -#define GMNAL_TXD_UNLOCK(a) spin_unlock(&a->stxd_lock); -#define GMNAL_TXD_TOKEN_INIT(a, n) sema_init(&a->stxd_token, n); -#define GMNAL_TXD_GETTOKEN(a) down(&a->stxd_token); -#define 
GMNAL_TXD_TRYGETTOKEN(a) down_trylock(&a->stxd_token) -#define GMNAL_TXD_RETURNTOKEN(a) up(&a->stxd_token); - -#define GMNAL_RXT_TXD_LOCK_INIT(a) spin_lock_init(&a->rxt_stxd_lock); -#define GMNAL_RXT_TXD_LOCK(a) spin_lock(&a->rxt_stxd_lock); -#define GMNAL_RXT_TXD_UNLOCK(a) spin_unlock(&a->rxt_stxd_lock); -#define GMNAL_RXT_TXD_TOKEN_INIT(a, n) sema_init(&a->rxt_stxd_token, n); -#define GMNAL_RXT_TXD_GETTOKEN(a) down(&a->rxt_stxd_token); -#define GMNAL_RXT_TXD_TRYGETTOKEN(a) down_trylock(&a->rxt_stxd_token) -#define GMNAL_RXT_TXD_RETURNTOKEN(a) up(&a->rxt_stxd_token); - -#define GMNAL_LTXD_LOCK_INIT(a) spin_lock_init(&a->ltxd_lock); -#define GMNAL_LTXD_LOCK(a) spin_lock(&a->ltxd_lock); -#define GMNAL_LTXD_UNLOCK(a) spin_unlock(&a->ltxd_lock); -#define GMNAL_LTXD_TOKEN_INIT(a, n) sema_init(&a->ltxd_token, n); -#define GMNAL_LTXD_GETTOKEN(a) down(&a->ltxd_token); -#define GMNAL_LTXD_TRYGETTOKEN(a) down_trylock(&a->ltxd_token) -#define GMNAL_LTXD_RETURNTOKEN(a) up(&a->ltxd_token); - -#define GMNAL_RXD_LOCK_INIT(a) spin_lock_init(&a->srxd_lock); -#define GMNAL_RXD_LOCK(a) spin_lock(&a->srxd_lock); -#define GMNAL_RXD_UNLOCK(a) spin_unlock(&a->srxd_lock); -#define GMNAL_RXD_TOKEN_INIT(a, n) sema_init(&a->srxd_token, n); -#define GMNAL_RXD_GETTOKEN(a) down(&a->srxd_token); -#define GMNAL_RXD_TRYGETTOKEN(a) down_trylock(&a->srxd_token) -#define GMNAL_RXD_RETURNTOKEN(a) up(&a->srxd_token); - -#define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock); -#define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock); -#define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock); - - -/* - * Memory Allocator - */ - -/* - * API NAL - */ -int gmnal_api_startup(nal_t *, ptl_pid_t, - ptl_ni_limits_t *, ptl_ni_limits_t *); - -int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t); - -void gmnal_api_shutdown(nal_t *); - -int gmnal_api_validate(nal_t *, void *, size_t); - -void gmnal_api_yield(nal_t *, unsigned long *, int); - -void gmnal_api_lock(nal_t *, unsigned long *); - -void 
gmnal_api_unlock(nal_t *, unsigned long *); - - -#define GMNAL_INIT_NAL(a) do { \ - (a)->nal_ni_init = gmnal_api_startup; \ - (a)->nal_ni_fini = gmnal_api_shutdown; \ - (a)->nal_data = NULL; \ - } while (0) - - -/* - * CB NAL - */ - -ptl_err_t gmnal_cb_send(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t, size_t); - -ptl_err_t gmnal_cb_send_pages(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t, size_t); - -ptl_err_t gmnal_cb_recv(lib_nal_t *, void *, lib_msg_t *, - unsigned int, struct iovec *, size_t, size_t, size_t); - -ptl_err_t gmnal_cb_recv_pages(lib_nal_t *, void *, lib_msg_t *, - unsigned int, ptl_kiov_t *, size_t, size_t, size_t); - -int gmnal_cb_dist(lib_nal_t *, ptl_nid_t, unsigned long *); - -int gmnal_init(void); - -void gmnal_fini(void); - - - -#define GMNAL_INIT_NAL_CB(a) do { \ - a->libnal_send = gmnal_cb_send; \ - a->libnal_send_pages = gmnal_cb_send_pages; \ - a->libnal_recv = gmnal_cb_recv; \ - a->libnal_recv_pages = gmnal_cb_recv_pages; \ - a->libnal_map = NULL; \ - a->libnal_unmap = NULL; \ - a->libnal_dist = gmnal_cb_dist; \ - a->libnal_data = NULL; \ - } while (0) - - -/* - * Small and Large Transmit and Receive Descriptor Functions - */ -int gmnal_alloc_txd(gmnal_data_t *); -void gmnal_free_txd(gmnal_data_t *); -gmnal_stxd_t* gmnal_get_stxd(gmnal_data_t *, int); -void gmnal_return_stxd(gmnal_data_t *, gmnal_stxd_t *); -gmnal_ltxd_t* gmnal_get_ltxd(gmnal_data_t *); -void gmnal_return_ltxd(gmnal_data_t *, gmnal_ltxd_t *); - -int gmnal_alloc_srxd(gmnal_data_t *); -void gmnal_free_srxd(gmnal_data_t *); -gmnal_srxd_t* gmnal_get_srxd(gmnal_data_t *, int); -void gmnal_return_srxd(gmnal_data_t *, gmnal_srxd_t *); - -/* - * general utility functions - */ -gmnal_srxd_t *gmnal_rxbuffer_to_srxd(gmnal_data_t *, void*); -void gmnal_stop_rxthread(gmnal_data_t *); -void gmnal_stop_ctthread(gmnal_data_t *); -void 
gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); -void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t); -void gmnal_resume_sending_callback(gm_port_t *, void *, gm_status_t); -char *gmnal_gm_error(gm_status_t); -char *gmnal_rxevent(gm_recv_event_t*); -int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int); -void gmnal_yield(int); -int gmnal_start_kernel_threads(gmnal_data_t *); - - -/* - * Communication functions - */ - -/* - * Receive threads - */ -int gmnal_ct_thread(void *); /* caretaker thread */ -int gmnal_rx_thread(void *); /* receive thread */ -int gmnal_pre_receive(gmnal_data_t*, gmnal_rxtwe_t*, int); -int gmnal_rx_bad(gmnal_data_t *, gmnal_rxtwe_t *, gmnal_srxd_t*); -int gmnal_rx_requeue_buffer(gmnal_data_t *, gmnal_srxd_t *); -int gmnal_add_rxtwe(gmnal_data_t *, gm_recv_t *); -gmnal_rxtwe_t * gmnal_get_rxtwe(gmnal_data_t *); -void gmnal_remove_rxtwe(gmnal_data_t *); - - -/* - * Small messages - */ -int gmnal_small_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, - struct iovec *, size_t, size_t, size_t); -int gmnal_small_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, - unsigned int, struct iovec*, size_t, int); -void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); - - - -/* - * Large messages - */ -int gmnal_large_rx(lib_nal_t *, void *, lib_msg_t *, unsigned int, - struct iovec *, size_t, size_t, size_t); - -int gmnal_large_tx(lib_nal_t *, void *, lib_msg_t *, ptl_hdr_t *, - int, ptl_nid_t, ptl_pid_t, unsigned int, - struct iovec*, size_t, int); - -void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t); - -int gmnal_remote_get(gmnal_srxd_t *, int, struct iovec*, int, - struct iovec*); - -void gmnal_remote_get_callback(gm_port_t *, void *, gm_status_t); - -int gmnal_copyiov(int, gmnal_srxd_t *, int, struct iovec*, int, - struct iovec*); - -void gmnal_large_tx_ack(gmnal_data_t *, gmnal_srxd_t *); -void gmnal_large_tx_ack_callback(gm_port_t *, void *, gm_status_t); -void 
gmnal_large_tx_ack_received(gmnal_data_t *, gmnal_srxd_t *); - -#endif /*__INCLUDE_GMNAL_H__*/ diff --git a/lustre/portals/knals/gmnal/gmnal_api.c b/lustre/portals/knals/gmnal/gmnal_api.c deleted file mode 100644 index bd6c83e..0000000 --- a/lustre/portals/knals/gmnal/gmnal_api.c +++ /dev/null @@ -1,424 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -/* - * Implements the API NAL functions - */ - -#include "gmnal.h" - - - -gmnal_data_t *global_nal_data = NULL; -#define GLOBAL_NID_STR_LEN 16 -char global_nid_str[GLOBAL_NID_STR_LEN] = {0}; -ptl_handle_ni_t kgmnal_ni; - -extern int gmnal_cmd(struct portals_cfg *pcfg, void *private); - -/* - * Write the global nid /proc/sys/gmnal/globalnid - */ -#define GMNAL_SYSCTL 201 -#define GMNAL_SYSCTL_GLOBALNID 1 - -static ctl_table gmnal_sysctl_table[] = { - {GMNAL_SYSCTL_GLOBALNID, "globalnid", - global_nid_str, GLOBAL_NID_STR_LEN, - 0444, NULL, &proc_dostring}, - { 0 } -}; - - -static ctl_table gmnalnal_top_sysctl_table[] = { - {GMNAL_SYSCTL, "gmnal", NULL, 0, 0555, gmnal_sysctl_table}, - { 0 } -}; - -/* - * gmnal_api_shutdown - * nal_refct == 0 => called on last matching PtlNIFini() - * Close down this interface and free any resources associated with it - * nal_t nal our nal to shutdown - */ -void -gmnal_api_shutdown(nal_t *nal) -{ - gmnal_data_t *nal_data; - lib_nal_t *libnal; - - if (nal->nal_refct != 0) - return; - - - LASSERT(nal == global_nal_data->nal); - libnal = (lib_nal_t *)nal->nal_data; - nal_data = (gmnal_data_t *)libnal->libnal_data; - LASSERT(nal_data == global_nal_data); - CDEBUG(D_TRACE, "gmnal_api_shutdown: nal_data [%p]\n", nal_data); - - /* Stop portals calling our ioctl handler */ - libcfs_nal_cmd_unregister(GMNAL); - - /* XXX for shutdown "under fire" we probably need to set a shutdown - * flag so when lib calls us we fail immediately and dont queue any - * more work but our threads can still call into lib OK. 
THEN - * shutdown our threads, THEN lib_fini() */ - lib_fini(libnal); - - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - if (nal_data->sysctl) - unregister_sysctl_table (nal_data->sysctl); - /* Don't free 'nal'; it's a static struct */ - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - - global_nal_data = NULL; - PORTAL_MODULE_UNUSE; -} - - -int -gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - - lib_nal_t *libnal = NULL; - gmnal_data_t *nal_data = NULL; - gmnal_srxd_t *srxd = NULL; - gm_status_t gm_status; - unsigned int local_nid = 0, global_nid = 0; - ptl_process_id_t process_id; - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) { - libnal = (lib_nal_t *)nal->nal_data; - *actual_limits = libnal->libnal_ni.ni_actual_limits; - } - return (PTL_OK); - } - - /* Called on first PtlNIInit() */ - - CDEBUG(D_TRACE, "startup\n"); - - LASSERT(global_nal_data == NULL); - - PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t)); - if (!nal_data) { - CDEBUG(D_ERROR, "can't get memory\n"); - return(PTL_NO_SPACE); - } - memset(nal_data, 0, sizeof(gmnal_data_t)); - /* - * set the small message buffer size - */ - - CDEBUG(D_INFO, "Allocd and reset nal_data[%p]\n", nal_data); - CDEBUG(D_INFO, "small_msg_size is [%d]\n", nal_data->small_msg_size); - - PORTAL_ALLOC(libnal, sizeof(lib_nal_t)); - if (!libnal) { - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - return(PTL_NO_SPACE); - } - memset(libnal, 0, sizeof(lib_nal_t)); - CDEBUG(D_INFO, "Allocd and reset libnal[%p]\n", libnal); - - GMNAL_INIT_NAL_CB(libnal); - /* - * String them all together - */ - libnal->libnal_data = (void*)nal_data; - nal_data->nal = nal; - nal_data->libnal = libnal; - - GMNAL_GM_LOCK_INIT(nal_data); - - - /* - * 
initialise the interface, - */ - CDEBUG(D_INFO, "Calling gm_init\n"); - if (gm_init() != GM_SUCCESS) { - CDEBUG(D_ERROR, "call to gm_init failed\n"); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - - CDEBUG(D_NET, "Calling gm_open with port [%d], " - "name [%s], version [%d]\n", GMNAL_GM_PORT, - "gmnal", GM_API_VERSION); - - GMNAL_GM_LOCK(nal_data); - gm_status = gm_open(&nal_data->gm_port, 0, GMNAL_GM_PORT, "gmnal", - GM_API_VERSION); - GMNAL_GM_UNLOCK(nal_data); - - CDEBUG(D_INFO, "gm_open returned [%d]\n", gm_status); - if (gm_status == GM_SUCCESS) { - CDEBUG(D_INFO, "gm_open succeeded port[%p]\n", - nal_data->gm_port); - } else { - switch(gm_status) { - case(GM_INVALID_PARAMETER): - CDEBUG(D_ERROR, "gm_open Failure. Invalid Parameter\n"); - break; - case(GM_BUSY): - CDEBUG(D_ERROR, "gm_open Failure. GM Busy\n"); - break; - case(GM_NO_SUCH_DEVICE): - CDEBUG(D_ERROR, "gm_open Failure. No such device\n"); - break; - case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib " - "and driver\n"); - break; - case(GM_OUT_OF_MEMORY): - CDEBUG(D_ERROR, "gm_open Failure. Out of Memory\n"); - break; - default: - CDEBUG(D_ERROR, "gm_open Failure. 
Unknow error " - "code [%d]\n", gm_status); - break; - } - GMNAL_GM_LOCK(nal_data); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - - nal_data->small_msg_size = gmnal_small_msg_size; - nal_data->small_msg_gmsize = - gm_min_size_for_length(gmnal_small_msg_size); - - if (gmnal_alloc_srxd(nal_data) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "Failed to allocate small rx descriptors\n"); - gmnal_free_txd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - - /* - * Hang out a bunch of small receive buffers - * In fact hang them all out - */ - while((srxd = gmnal_get_srxd(nal_data, 0))) { - CDEBUG(D_NET, "giving [%p] to gm_provide_recvive_buffer\n", - srxd->buffer); - GMNAL_GM_LOCK(nal_data); - gm_provide_receive_buffer_with_tag(nal_data->gm_port, - srxd->buffer, srxd->gmsize, - GM_LOW_PRIORITY, 0); - GMNAL_GM_UNLOCK(nal_data); - } - - /* - * Allocate pools of small tx buffers and descriptors - */ - if (gmnal_alloc_txd(nal_data) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "Failed to allocate small tx descriptors\n"); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - gmnal_start_kernel_threads(nal_data); - - while (nal_data->rxthread_flag != GMNAL_RXTHREADS_STARTED) { - gmnal_yield(1); - CDEBUG(D_INFO, "Waiting for receive thread signs of life\n"); - } - - CDEBUG(D_INFO, "receive thread seems to have started\n"); - - - /* - * Initialise the portals library - */ - CDEBUG(D_NET, "Getting node id\n"); - GMNAL_GM_LOCK(nal_data); - gm_status = gm_get_node_id(nal_data->gm_port, &local_nid); - GMNAL_GM_UNLOCK(nal_data); - if 
(gm_status != GM_SUCCESS) { - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - CDEBUG(D_ERROR, "can't determine node id\n"); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - nal_data->gm_local_nid = local_nid; - CDEBUG(D_INFO, "Local node id is [%u]\n", local_nid); - - GMNAL_GM_LOCK(nal_data); - gm_status = gm_node_id_to_global_id(nal_data->gm_port, local_nid, - &global_nid); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "failed to obtain global id\n"); - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); - nal_data->gm_global_nid = global_nid; - snprintf(global_nid_str, GLOBAL_NID_STR_LEN, "%u", global_nid); - -/* - pid = gm_getpid(); -*/ - process_id.pid = requested_pid; - process_id.nid = global_nid; - - CDEBUG(D_INFO, "portals_pid is [%u]\n", process_id.pid); - CDEBUG(D_INFO, "portals_nid is ["LPU64"]\n", process_id.nid); - - CDEBUG(D_PORTALS, "calling lib_init\n"); - if (lib_init(libnal, nal, process_id, - requested_limits, actual_limits) != PTL_OK) { - CDEBUG(D_ERROR, "lib_init failed\n"); - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - - } - - if 
(libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data) != 0) { - CDEBUG(D_INFO, "libcfs_nal_cmd_register failed\n"); - - /* XXX these cleanup cases should be restructured to - * minimise duplication... */ - lib_fini(libnal); - - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - gmnal_free_txd(nal_data); - gmnal_free_srxd(nal_data); - GMNAL_GM_LOCK(nal_data); - gm_close(nal_data->gm_port); - gm_finalize(); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); - PORTAL_FREE(libnal, sizeof(lib_nal_t)); - return(PTL_FAIL); - } - - /* might be better to initialise this at module load rather than in - * NAL startup */ - nal_data->sysctl = NULL; - nal_data->sysctl = register_sysctl_table (gmnalnal_top_sysctl_table, 0); - - - CDEBUG(D_INFO, "gmnal_init finished\n"); - global_nal_data = nal->nal_data; - - /* no unload now until shutdown */ - PORTAL_MODULE_USE; - - return(PTL_OK); -} - -nal_t the_gm_nal; - -/* - * Called when module loaded - */ -int gmnal_init(void) -{ - int rc; - - memset(&the_gm_nal, 0, sizeof(nal_t)); - CDEBUG(D_INFO, "reset nal[%p]\n", &the_gm_nal); - GMNAL_INIT_NAL(&the_gm_nal); - - rc = ptl_register_nal(GMNAL, &the_gm_nal); - if (rc != PTL_OK) - CERROR("Can't register GMNAL: %d\n", rc); - rc = PtlNIInit(GMNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kgmnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(GMNAL); - return (-ENODEV); - } - - return (rc); -} - - - -/* - * Called when module removed - */ -void gmnal_fini() -{ - CDEBUG(D_TRACE, "gmnal_fini\n"); - - LASSERT(global_nal_data == NULL); - PtlNIFini(kgmnal_ni); - - ptl_unregister_nal(GMNAL); -} diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c deleted file mode 100644 index 0ebf437..0000000 --- a/lustre/portals/knals/gmnal/gmnal_cb.c +++ /dev/null @@ -1,207 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 
Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -/* - * This file implements the nal cb functions - */ - - -#include "gmnal.h" - -ptl_err_t gmnal_cb_recv(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - unsigned int niov, struct iovec *iov, size_t offset, - size_t mlen, size_t rlen) -{ - gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; - int status = PTL_OK; - - - CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], cookie[%p], " - "niov[%d], iov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", - libnal, private, cookie, niov, iov, offset, mlen, rlen); - - switch(srxd->type) { - case(GMNAL_SMALL_MESSAGE): - CDEBUG(D_INFO, "gmnal_cb_recv got small message\n"); - status = gmnal_small_rx(libnal, private, cookie, niov, - iov, offset, mlen, rlen); - break; - case(GMNAL_LARGE_MESSAGE_INIT): - CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n"); - status = gmnal_large_rx(libnal, private, cookie, niov, - iov, offset, mlen, rlen); - } - - - CDEBUG(D_INFO, "gmnal_cb_recv gmnal_return status [%d]\n", status); - return(status); -} - -ptl_err_t gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - unsigned int kniov, ptl_kiov_t *kiov, size_t offset, - size_t mlen, size_t rlen) -{ - gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; - int status = PTL_OK; - struct iovec 
*iovec = NULL, *iovec_dup = NULL; - int i = 0; - ptl_kiov_t *kiov_dup = kiov;; - - - CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], " - "cookie[%p], kniov[%d], kiov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", - libnal, private, cookie, kniov, kiov, offset, mlen, rlen); - - if (srxd->type == GMNAL_SMALL_MESSAGE) { - PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov); - if (!iovec) { - CDEBUG(D_ERROR, "Can't malloc\n"); - return(GMNAL_STATUS_FAIL); - } - iovec_dup = iovec; - - /* - * map each page and create an iovec for it - */ - for (i=0; ikiov_page, kiov->kiov_len, - kiov->kiov_offset); - iovec->iov_len = kiov->kiov_len; - CDEBUG(D_INFO, "Calling kmap[%p]", kiov->kiov_page); - - iovec->iov_base = kmap(kiov->kiov_page) + - kiov->kiov_offset; - - CDEBUG(D_INFO, "iov_base is [%p]\n", iovec->iov_base); - iovec++; - kiov++; - } - CDEBUG(D_INFO, "calling gmnal_small_rx\n"); - status = gmnal_small_rx(libnal, private, cookie, kniov, - iovec_dup, offset, mlen, rlen); - for (i=0; ikiov_page); - kiov_dup++; - } - PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov); - } - - - CDEBUG(D_INFO, "gmnal_return status [%d]\n", status); - return(status); -} - - -ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t offset, size_t len) -{ - - gmnal_data_t *nal_data; - - - CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] offset["LPSZ"] len["LPSZ"] nid["LPU64"]\n", - niov, offset, len, nid); - nal_data = libnal->libnal_data; - - if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) { - CDEBUG(D_INFO, "This is a small message send\n"); - gmnal_small_tx(libnal, private, cookie, hdr, type, nid, pid, - niov, iov, offset, len); - } else { - CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(libnal, private, cookie, PTL_FAIL); - return(PTL_FAIL); - gmnal_large_tx(libnal, private, cookie, hdr, type, nid, pid, - niov, iov, offset, 
len); - } - return(PTL_OK); -} - -ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int kniov, ptl_kiov_t *kiov, size_t offset, size_t len) -{ - - int i = 0; - gmnal_data_t *nal_data; - struct iovec *iovec = NULL, *iovec_dup = NULL; - ptl_kiov_t *kiov_dup = kiov; - - CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] offset["LPSZ"] len["LPSZ"]\n", - nid, kniov, offset, len); - nal_data = libnal->libnal_data; - PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec)); - iovec_dup = iovec; - if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) { - CDEBUG(D_INFO, "This is a small message send\n"); - - for (i=0; ikiov_page, kiov->kiov_len, - kiov->kiov_offset); - - iovec->iov_base = kmap(kiov->kiov_page) - + kiov->kiov_offset; - - iovec->iov_len = kiov->kiov_len; - iovec++; - kiov++; - } - gmnal_small_tx(libnal, private, cookie, hdr, type, nid, - pid, kniov, iovec_dup, offset, len); - } else { - CDEBUG(D_ERROR, "Large message send it is not supported yet\n"); - return(PTL_FAIL); - for (i=0; ikiov_page, kiov->kiov_len, - kiov->kiov_offset); - - iovec->iov_base = kmap(kiov->kiov_page) - + kiov->kiov_offset; - iovec->iov_len = kiov->kiov_len; - iovec++; - kiov++; - } - gmnal_large_tx(libnal, private, cookie, hdr, type, nid, - pid, kniov, iovec, offset, len); - } - for (i=0; ikiov_page); - kiov_dup++; - } - PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec)); - return(PTL_OK); -} - -int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist) -{ - CDEBUG(D_TRACE, "gmnal_cb_dist\n"); - if (dist) - *dist = 27; - return(PTL_OK); -} diff --git a/lustre/portals/knals/gmnal/gmnal_comm.c b/lustre/portals/knals/gmnal/gmnal_comm.c deleted file mode 100644 index 6a8fcbc..0000000 --- a/lustre/portals/knals/gmnal/gmnal_comm.c +++ /dev/null @@ -1,1380 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * 
Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * This file contains all gmnal send and receive functions - */ - -#include "gmnal.h" - -/* - * The caretaker thread - * This is main thread of execution for the NAL side - * This guy waits in gm_blocking_recvive and gets - * woken up when the myrinet adaptor gets an interrupt. - * Hands off receive operations to the receive thread - * This thread Looks after gm_callbacks etc inline. - */ -int -gmnal_ct_thread(void *arg) -{ - gmnal_data_t *nal_data; - gm_recv_event_t *rxevent = NULL; - gm_recv_t *recv = NULL; - - if (!arg) { - CDEBUG(D_TRACE, "NO nal_data. 
Exiting\n"); - return(-1); - } - - nal_data = (gmnal_data_t*)arg; - CDEBUG(D_TRACE, "nal_data is [%p]\n", arg); - - daemonize(); - - nal_data->ctthread_flag = GMNAL_CTTHREAD_STARTED; - - GMNAL_GM_LOCK(nal_data); - while(nal_data->ctthread_flag == GMNAL_CTTHREAD_STARTED) { - CDEBUG(D_NET, "waiting\n"); - rxevent = gm_blocking_receive_no_spin(nal_data->gm_port); - if (nal_data->ctthread_flag == GMNAL_THREAD_STOP) { - CDEBUG(D_INFO, "time to exit\n"); - break; - } - CDEBUG(D_INFO, "got [%s]\n", gmnal_rxevent(rxevent)); - switch (GM_RECV_EVENT_TYPE(rxevent)) { - - case(GM_RECV_EVENT): - CDEBUG(D_NET, "CTTHREAD:: GM_RECV_EVENT\n"); - recv = (gm_recv_t*)&rxevent->recv; - GMNAL_GM_UNLOCK(nal_data); - gmnal_add_rxtwe(nal_data, recv); - GMNAL_GM_LOCK(nal_data); - CDEBUG(D_NET, "CTTHREAD:: Added event to Q\n"); - break; - case(_GM_SLEEP_EVENT): - /* - * Blocking receive above just returns - * immediatly with _GM_SLEEP_EVENT - * Don't know what this is - */ - CDEBUG(D_NET, "Sleeping in gm_unknown\n"); - GMNAL_GM_UNLOCK(nal_data); - gm_unknown(nal_data->gm_port, rxevent); - GMNAL_GM_LOCK(nal_data); - CDEBUG(D_INFO, "Awake from gm_unknown\n"); - break; - - default: - /* - * Don't know what this is - * gm_unknown will make sense of it - * Should be able to do something with - * FAST_RECV_EVENTS here. - */ - CDEBUG(D_NET, "Passing event to gm_unknown\n"); - GMNAL_GM_UNLOCK(nal_data); - gm_unknown(nal_data->gm_port, rxevent); - GMNAL_GM_LOCK(nal_data); - CDEBUG(D_INFO, "Processed unknown event\n"); - } - } - GMNAL_GM_UNLOCK(nal_data); - nal_data->ctthread_flag = GMNAL_THREAD_RESET; - CDEBUG(D_INFO, "thread nal_data [%p] is exiting\n", nal_data); - return(GMNAL_STATUS_OK); -} - - -/* - * process a receive event - */ -int gmnal_rx_thread(void *arg) -{ - gmnal_data_t *nal_data; - void *buffer; - gmnal_rxtwe_t *we = NULL; - - if (!arg) { - CDEBUG(D_TRACE, "NO nal_data. 
Exiting\n"); - return(-1); - } - - nal_data = (gmnal_data_t*)arg; - CDEBUG(D_TRACE, "nal_data is [%p]\n", arg); - - daemonize(); - /* - * set 1 bit for each thread started - * doesn't matter which bit - */ - spin_lock(&nal_data->rxthread_flag_lock); - if (nal_data->rxthread_flag) - nal_data->rxthread_flag=nal_data->rxthread_flag*2 + 1; - else - nal_data->rxthread_flag = 1; - CDEBUG(D_INFO, "rxthread flag is [%ld]\n", nal_data->rxthread_flag); - spin_unlock(&nal_data->rxthread_flag_lock); - - while(nal_data->rxthread_stop_flag != GMNAL_THREAD_STOP) { - CDEBUG(D_NET, "RXTHREAD:: Receive thread waiting\n"); - we = gmnal_get_rxtwe(nal_data); - if (!we) { - CDEBUG(D_INFO, "Receive thread time to exit\n"); - break; - } - - buffer = we->buffer; - switch(((gmnal_msghdr_t*)buffer)->type) { - case(GMNAL_SMALL_MESSAGE): - gmnal_pre_receive(nal_data, we, - GMNAL_SMALL_MESSAGE); - break; - case(GMNAL_LARGE_MESSAGE_INIT): - gmnal_pre_receive(nal_data, we, - GMNAL_LARGE_MESSAGE_INIT); - break; - case(GMNAL_LARGE_MESSAGE_ACK): - gmnal_pre_receive(nal_data, we, - GMNAL_LARGE_MESSAGE_ACK); - break; - default: - CDEBUG(D_ERROR, "Unsupported message type\n"); - gmnal_rx_bad(nal_data, we, NULL); - } - PORTAL_FREE(we, sizeof(gmnal_rxtwe_t)); - } - - spin_lock(&nal_data->rxthread_flag_lock); - nal_data->rxthread_flag/=2; - CDEBUG(D_INFO, "rxthread flag is [%ld]\n", nal_data->rxthread_flag); - spin_unlock(&nal_data->rxthread_flag_lock); - CDEBUG(D_INFO, "thread nal_data [%p] is exiting\n", nal_data); - return(GMNAL_STATUS_OK); -} - - - -/* - * Start processing a small message receive - * Get here from gmnal_receive_thread - * Hand off to lib_parse, which calls cb_recv - * which hands back to gmnal_small_receive - * Deal with all endian stuff here. 
- */ -int -gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) -{ - gmnal_srxd_t *srxd = NULL; - void *buffer = NULL; - unsigned int snode, sport, type, length; - gmnal_msghdr_t *gmnal_msghdr; - ptl_hdr_t *portals_hdr; - int rc; - - CDEBUG(D_INFO, "nal_data [%p], we[%p] type [%d]\n", - nal_data, we, gmnal_type); - - buffer = we->buffer; - snode = we->snode; - sport = we->sport; - type = we->type; - buffer = we->buffer; - length = we->length; - - gmnal_msghdr = (gmnal_msghdr_t*)buffer; - portals_hdr = (ptl_hdr_t*)(buffer+GMNAL_MSGHDR_SIZE); - - CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], " - "type [%d], length [%d], buffer [%p]\n", - snode, sport, type, length, buffer); - CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], " - "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, - gmnal_msghdr->magic, gmnal_msghdr->type); - CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], " - "dest_node ["LPD64"]\n", portals_hdr->src_nid, - portals_hdr->dest_nid); - - - /* - * Get a receive descriptor for this message - */ - srxd = gmnal_rxbuffer_to_srxd(nal_data, buffer); - CDEBUG(D_INFO, "Back from gmnal_rxbuffer_to_srxd\n"); - if (!srxd) { - CDEBUG(D_ERROR, "Failed to get receive descriptor\n"); - /* I think passing a NULL srxd to lib_parse will crash - * gmnal_recv() */ - LBUG(); - lib_parse(nal_data->libnal, portals_hdr, srxd); - return(GMNAL_STATUS_FAIL); - } - - /* - * no need to bother portals library with this - */ - if (gmnal_type == GMNAL_LARGE_MESSAGE_ACK) { - gmnal_large_tx_ack_received(nal_data, srxd); - return(GMNAL_STATUS_OK); - } - - srxd->nal_data = nal_data; - srxd->type = gmnal_type; - srxd->nsiov = gmnal_msghdr->niov; - srxd->gm_source_node = gmnal_msghdr->sender_node_id; - - CDEBUG(D_PORTALS, "Calling lib_parse buffer is [%p]\n", - buffer+GMNAL_MSGHDR_SIZE); - /* - * control passes to lib, which calls cb_recv - * cb_recv is responsible for returning the buffer - * for future receive - */ - rc = 
lib_parse(nal_data->libnal, portals_hdr, srxd); - - if (rc != PTL_OK) { - /* I just received garbage; take appropriate action... */ - LBUG(); - } - - return(GMNAL_STATUS_OK); -} - - - -/* - * After a receive has been processed, - * hang out the receive buffer again. - * This implicitly returns a receive token. - */ -int -gmnal_rx_requeue_buffer(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) -{ - CDEBUG(D_TRACE, "gmnal_rx_requeue_buffer\n"); - - CDEBUG(D_NET, "requeueing srxd[%p] nal_data[%p]\n", srxd, nal_data); - - GMNAL_GM_LOCK(nal_data); - gm_provide_receive_buffer_with_tag(nal_data->gm_port, srxd->buffer, - srxd->gmsize, GM_LOW_PRIORITY, 0 ); - GMNAL_GM_UNLOCK(nal_data); - - return(GMNAL_STATUS_OK); -} - - -/* - * Handle a bad message - * A bad message is one we don't expect or can't interpret - */ -int -gmnal_rx_bad(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, gmnal_srxd_t *srxd) -{ - CDEBUG(D_TRACE, "Can't handle message\n"); - - if (!srxd) - srxd = gmnal_rxbuffer_to_srxd(nal_data, - we->buffer); - if (srxd) { - gmnal_rx_requeue_buffer(nal_data, srxd); - } else { - CDEBUG(D_ERROR, "Can't find a descriptor for this buffer\n"); - /* - * get rid of it ? - */ - return(GMNAL_STATUS_FAIL); - } - - return(GMNAL_STATUS_OK); -} - - - -/* - * Process a small message receive. 
- * Get here from gmnal_receive_thread, gmnal_pre_receive - * lib_parse, cb_recv - * Put data from prewired receive buffer into users buffer(s) - * Hang out the receive buffer again for another receive - * Call lib_finalize - */ -int -gmnal_small_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - unsigned int niov, struct iovec *iov, size_t offset, size_t mlen, size_t rlen) -{ - gmnal_srxd_t *srxd = NULL; - void *buffer = NULL; - gmnal_data_t *nal_data = (gmnal_data_t*)libnal->libnal_data; - - - CDEBUG(D_TRACE, "niov [%d] mlen["LPSZ"]\n", niov, mlen); - - if (!private) { - CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(libnal, private, cookie, PTL_FAIL); - return(PTL_FAIL); - } - - srxd = (gmnal_srxd_t*)private; - buffer = srxd->buffer; - buffer += sizeof(gmnal_msghdr_t); - buffer += sizeof(ptl_hdr_t); - - while(niov--) { - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - } else if (offset > 0) { - CDEBUG(D_INFO, "processing [%p] base [%p] len %d, " - "offset %d, len ["LPSZ"]\n", iov, - iov->iov_base + offset, iov->iov_len, offset, - iov->iov_len - offset); - gm_bcopy(buffer, iov->iov_base + offset, - iov->iov_len - offset); - offset = 0; - buffer += iov->iov_len - offset; - } else { - CDEBUG(D_INFO, "processing [%p] len ["LPSZ"]\n", iov, - iov->iov_len); - gm_bcopy(buffer, iov->iov_base, iov->iov_len); - buffer += iov->iov_len; - } - iov++; - } - - - /* - * let portals library know receive is complete - */ - CDEBUG(D_PORTALS, "calling lib_finalize\n"); - lib_finalize(libnal, private, cookie, PTL_OK); - /* - * return buffer so it can be used again - */ - CDEBUG(D_NET, "calling gm_provide_receive_buffer\n"); - GMNAL_GM_LOCK(nal_data); - gm_provide_receive_buffer_with_tag(nal_data->gm_port, srxd->buffer, - srxd->gmsize, GM_LOW_PRIORITY, 0); - GMNAL_GM_UNLOCK(nal_data); - - return(PTL_OK); -} - - -/* - * Start a small transmit. - * Get a send token (and wired transmit buffer). 
- * Copy data from senders buffer to wired buffer and - * initiate gm_send from the wired buffer. - * The callback function informs when the send is complete. - */ -int -gmnal_small_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t offset, int size) -{ - gmnal_data_t *nal_data = (gmnal_data_t*)libnal->libnal_data; - gmnal_stxd_t *stxd = NULL; - void *buffer = NULL; - gmnal_msghdr_t *msghdr = NULL; - int tot_size = 0; - unsigned int local_nid; - gm_status_t gm_status = GM_SUCCESS; - - CDEBUG(D_TRACE, "gmnal_small_tx libnal [%p] private [%p] cookie [%p] " - "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " - "iov [%p] size [%d]\n", libnal, private, cookie, hdr, type, - global_nid, pid, niov, iov, size); - - CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", - hdr->dest_nid, hdr->src_nid); - - if (!nal_data) { - CDEBUG(D_ERROR, "no nal_data\n"); - return(GMNAL_STATUS_FAIL); - } else { - CDEBUG(D_INFO, "nal_data [%p]\n", nal_data); - } - - GMNAL_GM_LOCK(nal_data); - gm_status = gm_global_id_to_node_id(nal_data->gm_port, global_nid, - &local_nid); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "Failed to obtain local id\n"); - return(GMNAL_STATUS_FAIL); - } - CDEBUG(D_INFO, "Local Node_id is [%u][%x]\n", local_nid, local_nid); - - stxd = gmnal_get_stxd(nal_data, 1); - CDEBUG(D_INFO, "stxd [%p]\n", stxd); - - stxd->type = GMNAL_SMALL_MESSAGE; - stxd->cookie = cookie; - - /* - * Copy gmnal_msg_hdr and portals header to the transmit buffer - * Then copy the data in - */ - buffer = stxd->buffer; - msghdr = (gmnal_msghdr_t*)buffer; - - msghdr->magic = GMNAL_MAGIC; - msghdr->type = GMNAL_SMALL_MESSAGE; - msghdr->sender_node_id = nal_data->gm_global_nid; - CDEBUG(D_INFO, "processing msghdr at [%p]\n", buffer); - - buffer += sizeof(gmnal_msghdr_t); - - CDEBUG(D_INFO, "processing portals hdr at 
[%p]\n", buffer); - gm_bcopy(hdr, buffer, sizeof(ptl_hdr_t)); - - buffer += sizeof(ptl_hdr_t); - - while(niov--) { - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - } else if (offset > 0) { - CDEBUG(D_INFO, "processing iov [%p] base [%p] len ["LPSZ"] to [%p]\n", - iov, iov->iov_base + offset, iov->iov_len - offset, buffer); - gm_bcopy(iov->iov_base + offset, buffer, iov->iov_len - offset); - buffer+= iov->iov_len - offset; - offset = 0; - } else { - CDEBUG(D_INFO, "processing iov [%p] len ["LPSZ"] to [%p]\n", - iov, iov->iov_len, buffer); - gm_bcopy(iov->iov_base, buffer, iov->iov_len); - buffer+= iov->iov_len; - } - iov++; - } - - CDEBUG(D_INFO, "sending\n"); - tot_size = size+sizeof(ptl_hdr_t)+sizeof(gmnal_msghdr_t); - stxd->msg_size = tot_size; - - - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " - "gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] " - "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, - stxd->msg_size, global_nid, local_nid, stxd); - - GMNAL_GM_LOCK(nal_data); - stxd->gm_priority = GM_LOW_PRIORITY; - stxd->gm_target_node = local_nid; - gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, - stxd->gm_size, stxd->msg_size, - GM_LOW_PRIORITY, local_nid, - gmnal_small_tx_callback, (void*)stxd); - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "done\n"); - - return(PTL_OK); -} - - -/* - * A callback to indicate the small transmit operation is compete - * Check for erros and try to deal with them. - * Call lib_finalise to inform the client application that the send - * is complete and the memory can be reused. 
- * Return the stxd when finished with it (returns a send token) - */ -void -gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) -{ - gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; - lib_msg_t *cookie = stxd->cookie; - gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; - lib_nal_t *libnal = nal_data->libnal; - - if (!stxd) { - CDEBUG(D_TRACE, "send completion event for unknown stxd\n"); - return; - } - if (status != GM_SUCCESS) { - CDEBUG(D_ERROR, "Result of send stxd [%p] is [%s]\n", - stxd, gmnal_gm_error(status)); - } - - switch(status) { - case(GM_SUCCESS): - break; - - - - case(GM_SEND_DROPPED): - /* - * do a resend on the dropped ones - */ - CDEBUG(D_ERROR, "send stxd [%p] was dropped " - "resending\n", context); - GMNAL_GM_LOCK(nal_data); - gm_send_to_peer_with_callback(nal_data->gm_port, - stxd->buffer, - stxd->gm_size, - stxd->msg_size, - stxd->gm_priority, - stxd->gm_target_node, - gmnal_small_tx_callback, - context); - GMNAL_GM_UNLOCK(nal_data); - - return; - case(GM_TIMED_OUT): - case(GM_SEND_TIMED_OUT): - /* - * drop these ones - */ - CDEBUG(D_INFO, "calling gm_drop_sends\n"); - GMNAL_GM_LOCK(nal_data); - gm_drop_sends(nal_data->gm_port, stxd->gm_priority, - stxd->gm_target_node, GMNAL_GM_PORT, - gmnal_drop_sends_callback, context); - GMNAL_GM_UNLOCK(nal_data); - - return; - - - /* - * abort on these ? 
- */ - case(GM_TRY_AGAIN): - case(GM_INTERRUPTED): - case(GM_FAILURE): - case(GM_INPUT_BUFFER_TOO_SMALL): - case(GM_OUTPUT_BUFFER_TOO_SMALL): - case(GM_BUSY): - case(GM_MEMORY_FAULT): - case(GM_INVALID_PARAMETER): - case(GM_OUT_OF_MEMORY): - case(GM_INVALID_COMMAND): - case(GM_PERMISSION_DENIED): - case(GM_INTERNAL_ERROR): - case(GM_UNATTACHED): - case(GM_UNSUPPORTED_DEVICE): - case(GM_SEND_REJECTED): - case(GM_SEND_TARGET_PORT_CLOSED): - case(GM_SEND_TARGET_NODE_UNREACHABLE): - case(GM_SEND_PORT_CLOSED): - case(GM_NODE_ID_NOT_YET_SET): - case(GM_STILL_SHUTTING_DOWN): - case(GM_CLONE_BUSY): - case(GM_NO_SUCH_DEVICE): - case(GM_ABORTED): - case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - case(GM_UNTRANSLATED_SYSTEM_ERROR): - case(GM_ACCESS_DENIED): - case(GM_NO_DRIVER_SUPPORT): - case(GM_PTE_REF_CNT_OVERFLOW): - case(GM_NOT_SUPPORTED_IN_KERNEL): - case(GM_NOT_SUPPORTED_ON_ARCH): - case(GM_NO_MATCH): - case(GM_USER_ERROR): - case(GM_DATA_CORRUPTED): - case(GM_HARDWARE_FAULT): - case(GM_SEND_ORPHANED): - case(GM_MINOR_OVERFLOW): - case(GM_PAGE_TABLE_FULL): - case(GM_UC_ERROR): - case(GM_INVALID_PORT_NUMBER): - case(GM_DEV_NOT_FOUND): - case(GM_FIRMWARE_NOT_RUNNING): - case(GM_YP_NO_MATCH): - default: - CDEBUG(D_ERROR, "Unknown send error\n"); - gm_resume_sending(nal_data->gm_port, stxd->gm_priority, - stxd->gm_target_node, GMNAL_GM_PORT, - gmnal_resume_sending_callback, context); - return; - - } - - /* - * TO DO - * If this is a large message init, - * we're not finished with the data yet, - * so can't call lib_finalise. - * However, we're also holding on to a - * stxd here (to keep track of the source - * iovec only). Should use another structure - * to keep track of iovec and return stxd to - * free list earlier. 
- */ - if (stxd->type == GMNAL_LARGE_MESSAGE_INIT) { - CDEBUG(D_INFO, "large transmit done\n"); - return; - } - gmnal_return_stxd(nal_data, stxd); - lib_finalize(libnal, stxd, cookie, PTL_OK); - return; -} - -/* - * After an error on the port - * call this to allow future sends to complete - */ -void gmnal_resume_sending_callback(struct gm_port *gm_port, void *context, - gm_status_t status) -{ - gmnal_data_t *nal_data; - gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; - CDEBUG(D_TRACE, "status is [%d] context is [%p]\n", status, context); - gmnal_return_stxd(stxd->nal_data, stxd); - return; -} - - -void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, - gm_status_t status) -{ - gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; - gmnal_data_t *nal_data = stxd->nal_data; - - CDEBUG(D_TRACE, "status is [%d] context is [%p]\n", status, context); - if (status == GM_SUCCESS) { - GMNAL_GM_LOCK(nal_data); - gm_send_to_peer_with_callback(gm_port, stxd->buffer, - stxd->gm_size, stxd->msg_size, - stxd->gm_priority, - stxd->gm_target_node, - gmnal_small_tx_callback, - context); - GMNAL_GM_LOCK(nal_data); - } else { - CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is " - "[%d][%s]\n", stxd, status, gmnal_gm_error(status)); - } - - - return; -} - - -/* - * Begine a large transmit. - * Do a gm_register of the memory pointed to by the iovec - * and send details to the receiver. The receiver does a gm_get - * to pull the data and sends and ack when finished. Upon receipt of - * this ack, deregister the memory. Only 1 send token is required here. 
- */ -int -gmnal_large_tx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t global_nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t offset, int size) -{ - - gmnal_data_t *nal_data; - gmnal_stxd_t *stxd = NULL; - void *buffer = NULL; - gmnal_msghdr_t *msghdr = NULL; - unsigned int local_nid; - int mlen = 0; /* the size of the init message data */ - struct iovec *iov_dup = NULL; - gm_status_t gm_status; - int niov_dup; - - - CDEBUG(D_TRACE, "gmnal_large_tx libnal [%p] private [%p], cookie [%p] " - "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " - "iov [%p], size [%d]\n", libnal, private, cookie, hdr, type, - global_nid, pid, niov, iov, size); - - if (libnal) - nal_data = (gmnal_data_t*)libnal->libnal_data; - else { - CDEBUG(D_ERROR, "no libnal.\n"); - return(GMNAL_STATUS_FAIL); - } - - - /* - * Get stxd and buffer. Put local address of data in buffer, - * send local addresses to target, - * wait for the target node to suck the data over. 
- * The stxd is used to ren - */ - stxd = gmnal_get_stxd(nal_data, 1); - CDEBUG(D_INFO, "stxd [%p]\n", stxd); - - stxd->type = GMNAL_LARGE_MESSAGE_INIT; - stxd->cookie = cookie; - - /* - * Copy gmnal_msg_hdr and portals header to the transmit buffer - * Then copy the iov in - */ - buffer = stxd->buffer; - msghdr = (gmnal_msghdr_t*)buffer; - - CDEBUG(D_INFO, "processing msghdr at [%p]\n", buffer); - - msghdr->magic = GMNAL_MAGIC; - msghdr->type = GMNAL_LARGE_MESSAGE_INIT; - msghdr->sender_node_id = nal_data->gm_global_nid; - msghdr->stxd = stxd; - msghdr->niov = niov ; - buffer += sizeof(gmnal_msghdr_t); - mlen = sizeof(gmnal_msghdr_t); - CDEBUG(D_INFO, "mlen is [%d]\n", mlen); - - - CDEBUG(D_INFO, "processing portals hdr at [%p]\n", buffer); - - gm_bcopy(hdr, buffer, sizeof(ptl_hdr_t)); - buffer += sizeof(ptl_hdr_t); - mlen += sizeof(ptl_hdr_t); - CDEBUG(D_INFO, "mlen is [%d]\n", mlen); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - } - - LASSERT(offset >= 0); - /* - * Store the iovs in the stxd for we can get - * them later if we need them - */ - stxd->iov[0].iov_base = iov->iov_base + offset; - stxd->iov[0].iov_len = iov->iov_len - offset; - CDEBUG(D_NET, "Copying iov [%p] to [%p], niov=%d\n", iov, stxd->iov, niov); - if (niov > 1) - gm_bcopy(&iov[1], &stxd->iov[1], (niov-1)*sizeof(struct iovec)); - stxd->niov = niov; - - /* - * copy the iov to the buffer so target knows - * where to get the data from - */ - CDEBUG(D_INFO, "processing iov to [%p]\n", buffer); - gm_bcopy(stxd->iov, buffer, stxd->niov*sizeof(struct iovec)); - mlen += stxd->niov*(sizeof(struct iovec)); - CDEBUG(D_INFO, "mlen is [%d]\n", mlen); - - /* - * register the memory so the NIC can get hold of the data - * This is a slow process. it'd be good to overlap it - * with something else. 
- */ - iov = stxd->iov; - iov_dup = iov; - niov_dup = niov; - while(niov--) { - CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n", - iov->iov_base, iov->iov_len); - GMNAL_GM_LOCK(nal_data); - gm_status = gm_register_memory(nal_data->gm_port, - iov->iov_base, iov->iov_len); - if (gm_status != GM_SUCCESS) { - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " - "for memory [%p] len ["LPSZ"]\n", - gm_status, gmnal_gm_error(gm_status), - iov->iov_base, iov->iov_len); - GMNAL_GM_LOCK(nal_data); - while (iov_dup != iov) { - gm_deregister_memory(nal_data->gm_port, - iov_dup->iov_base, - iov_dup->iov_len); - iov_dup++; - } - GMNAL_GM_UNLOCK(nal_data); - gmnal_return_stxd(nal_data, stxd); - return(PTL_FAIL); - } - - GMNAL_GM_UNLOCK(nal_data); - iov++; - } - - /* - * Send the init message to the target - */ - CDEBUG(D_INFO, "sending mlen [%d]\n", mlen); - GMNAL_GM_LOCK(nal_data); - gm_status = gm_global_id_to_node_id(nal_data->gm_port, global_nid, - &local_nid); - if (gm_status != GM_SUCCESS) { - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "Failed to obtain local id\n"); - gmnal_return_stxd(nal_data, stxd); - /* TO DO deregister memory on failure */ - return(GMNAL_STATUS_FAIL); - } - CDEBUG(D_INFO, "Local Node_id is [%d]\n", local_nid); - gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, - stxd->gm_size, mlen, GM_LOW_PRIORITY, - local_nid, gmnal_large_tx_callback, - (void*)stxd); - GMNAL_GM_UNLOCK(nal_data); - - CDEBUG(D_INFO, "done\n"); - - return(PTL_OK); -} - -/* - * Callback function indicates that send of buffer with - * large message iovec has completed (or failed). - */ -void -gmnal_large_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) -{ - gmnal_small_tx_callback(gm_port, context, status); - -} - - - -/* - * Have received a buffer that contains an iovec of the sender. - * Do a gm_register_memory of the receivers buffer and then do a get - * data from the sender. 
- */ -int -gmnal_large_rx(lib_nal_t *libnal, void *private, lib_msg_t *cookie, - unsigned int nriov, struct iovec *riov, size_t offset, - size_t mlen, size_t rlen) -{ - gmnal_data_t *nal_data = libnal->libnal_data; - gmnal_srxd_t *srxd = (gmnal_srxd_t*)private; - void *buffer = NULL; - struct iovec *riov_dup; - int nriov_dup; - gmnal_msghdr_t *msghdr = NULL; - gm_status_t gm_status; - - CDEBUG(D_TRACE, "gmnal_large_rx :: libnal[%p], private[%p], " - "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", - libnal, private, cookie, nriov, riov, mlen, rlen); - - if (!srxd) { - CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(libnal, private, cookie, PTL_FAIL); - return(PTL_FAIL); - } - - buffer = srxd->buffer; - msghdr = (gmnal_msghdr_t*)buffer; - buffer += sizeof(gmnal_msghdr_t); - buffer += sizeof(ptl_hdr_t); - - /* - * Store the senders stxd address in the srxd for this message - * The gmnal_large_message_ack needs it to notify the sender - * the pull of data is complete - */ - srxd->source_stxd = msghdr->stxd; - - /* - * Register the receivers memory - * get the data, - * tell the sender that we got the data - * then tell the receiver we got the data - * TO DO - * If the iovecs match, could interleave - * gm_registers and gm_gets for each element - */ - while (offset >= riov->iov_len) { - offset -= riov->iov_len; - riov++; - nriov--; - } - LASSERT (nriov >= 0); - LASSERT (offset >= 0); - /* - * do this so the final gm_get callback can deregister the memory - */ - PORTAL_ALLOC(srxd->riov, nriov*(sizeof(struct iovec))); - - srxd->riov[0].iov_base = riov->iov_base + offset; - srxd->riov[0].iov_len = riov->iov_len - offset; - if (nriov > 1) - gm_bcopy(&riov[1], &srxd->riov[1], (nriov-1)*(sizeof(struct iovec))); - srxd->nriov = nriov; - - riov = srxd->riov; - nriov_dup = nriov; - riov_dup = riov; - while(nriov--) { - CDEBUG(D_INFO, "Registering memory [%p] len ["LPSZ"] \n", - riov->iov_base, riov->iov_len); - GMNAL_GM_LOCK(nal_data); - gm_status = 
gm_register_memory(nal_data->gm_port, - riov->iov_base, riov->iov_len); - if (gm_status != GM_SUCCESS) { - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " - "for memory [%p] len ["LPSZ"]\n", - gm_status, gmnal_gm_error(gm_status), - riov->iov_base, riov->iov_len); - GMNAL_GM_LOCK(nal_data); - while (riov_dup != riov) { - gm_deregister_memory(nal_data->gm_port, - riov_dup->iov_base, - riov_dup->iov_len); - riov_dup++; - } - GMNAL_GM_LOCK(nal_data); - /* - * give back srxd and buffer. Send NACK to sender - */ - PORTAL_FREE(srxd->riov, nriov_dup*(sizeof(struct iovec))); - return(PTL_FAIL); - } - GMNAL_GM_UNLOCK(nal_data); - riov++; - } - - /* - * now do gm_get to get the data - */ - srxd->cookie = cookie; - if (gmnal_remote_get(srxd, srxd->nsiov, (struct iovec*)buffer, - nriov_dup, riov_dup) != GMNAL_STATUS_OK) { - CDEBUG(D_ERROR, "can't get the data"); - } - - CDEBUG(D_INFO, "lgmanl_large_rx done\n"); - - return(PTL_OK); -} - - -/* - * Perform a number of remote gets as part of receiving - * a large message. - * The final one to complete (i.e. the last callback to get called) - * tidies up. - * gm_get requires a send token. 
- */ -int -gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov, - int nriov, struct iovec *riov) -{ - - int ncalls = 0; - - CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p], " - "nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov); - - - ncalls = gmnal_copyiov(0, srxd, nsiov, siov, nriov, riov); - if (ncalls < 0) { - CDEBUG(D_ERROR, "there's something wrong with the iovecs\n"); - return(GMNAL_STATUS_FAIL); - } - CDEBUG(D_INFO, "gmnal_remote_get ncalls [%d]\n", ncalls); - spin_lock_init(&srxd->callback_lock); - srxd->ncallbacks = ncalls; - srxd->callback_status = 0; - - ncalls = gmnal_copyiov(1, srxd, nsiov, siov, nriov, riov); - if (ncalls < 0) { - CDEBUG(D_ERROR, "there's something wrong with the iovecs\n"); - return(GMNAL_STATUS_FAIL); - } - - return(GMNAL_STATUS_OK); - -} - - -/* - * pull data from source node (source iovec) to a local iovec. - * The iovecs may not match which adds the complications below. - * Count the number of gm_gets that will be required to the callbacks - * can determine who is the last one. 
- */ -int -gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov, - struct iovec *siov, int nriov, struct iovec *riov) -{ - - int ncalls = 0; - int slen = siov->iov_len, rlen = riov->iov_len; - char *sbuf = siov->iov_base, *rbuf = riov->iov_base; - unsigned long sbuf_long; - gm_remote_ptr_t remote_ptr = 0; - unsigned int source_node; - gmnal_ltxd_t *ltxd = NULL; - gmnal_data_t *nal_data = srxd->nal_data; - - CDEBUG(D_TRACE, "copy[%d] nal_data[%p]\n", do_copy, nal_data); - if (do_copy) { - if (!nal_data) { - CDEBUG(D_ERROR, "Bad args No nal_data\n"); - return(GMNAL_STATUS_FAIL); - } - GMNAL_GM_LOCK(nal_data); - if (gm_global_id_to_node_id(nal_data->gm_port, - srxd->gm_source_node, - &source_node) != GM_SUCCESS) { - - CDEBUG(D_ERROR, "cannot resolve global_id [%u] " - "to local node_id\n", srxd->gm_source_node); - GMNAL_GM_UNLOCK(nal_data); - return(GMNAL_STATUS_FAIL); - } - GMNAL_GM_UNLOCK(nal_data); - /* - * We need a send token to use gm_get - * getting an stxd gets us a send token. - * the stxd is used as the context to the - * callback function (so stxd can be returned). 
- * Set pointer in stxd to srxd so callback count in srxd - * can be decremented to find last callback to complete - */ - CDEBUG(D_INFO, "gmnal_copyiov source node is G[%u]L[%d]\n", - srxd->gm_source_node, source_node); - } - - do { - CDEBUG(D_INFO, "sbuf[%p] slen[%d] rbuf[%p], rlen[%d]\n", - sbuf, slen, rbuf, rlen); - if (slen > rlen) { - ncalls++; - if (do_copy) { - CDEBUG(D_INFO, "slen>rlen\n"); - ltxd = gmnal_get_ltxd(nal_data); - ltxd->srxd = srxd; - GMNAL_GM_LOCK(nal_data); - /* - * funny business to get rid - * of compiler warning - */ - sbuf_long = (unsigned long) sbuf; - remote_ptr = (gm_remote_ptr_t)sbuf_long; - gm_get(nal_data->gm_port, remote_ptr, rbuf, - rlen, GM_LOW_PRIORITY, source_node, - GMNAL_GM_PORT, - gmnal_remote_get_callback, ltxd); - GMNAL_GM_UNLOCK(nal_data); - } - /* - * at the end of 1 iov element - */ - sbuf+=rlen; - slen-=rlen; - riov++; - nriov--; - rbuf = riov->iov_base; - rlen = riov->iov_len; - } else if (rlen > slen) { - ncalls++; - if (do_copy) { - CDEBUG(D_INFO, "slensrxd = srxd; - GMNAL_GM_LOCK(nal_data); - sbuf_long = (unsigned long) sbuf; - remote_ptr = (gm_remote_ptr_t)sbuf_long; - gm_get(nal_data->gm_port, remote_ptr, rbuf, - slen, GM_LOW_PRIORITY, source_node, - GMNAL_GM_PORT, - gmnal_remote_get_callback, ltxd); - GMNAL_GM_UNLOCK(nal_data); - } - /* - * at end of siov element - */ - rbuf+=slen; - rlen-=slen; - siov++; - sbuf = siov->iov_base; - slen = siov->iov_len; - } else { - ncalls++; - if (do_copy) { - CDEBUG(D_INFO, "rlen=slen\n"); - ltxd = gmnal_get_ltxd(nal_data); - ltxd->srxd = srxd; - GMNAL_GM_LOCK(nal_data); - sbuf_long = (unsigned long) sbuf; - remote_ptr = (gm_remote_ptr_t)sbuf_long; - gm_get(nal_data->gm_port, remote_ptr, rbuf, - rlen, GM_LOW_PRIORITY, source_node, - GMNAL_GM_PORT, - gmnal_remote_get_callback, ltxd); - GMNAL_GM_UNLOCK(nal_data); - } - /* - * at end of siov and riov element - */ - siov++; - sbuf = siov->iov_base; - slen = siov->iov_len; - riov++; - nriov--; - rbuf = riov->iov_base; - rlen = 
riov->iov_len; - } - - } while (nriov); - return(ncalls); -} - - -/* - * The callback function that is invoked after each gm_get call completes. - * Multiple callbacks may be invoked for 1 transaction, only the final - * callback has work to do. - */ -void -gmnal_remote_get_callback(gm_port_t *gm_port, void *context, - gm_status_t status) -{ - - gmnal_ltxd_t *ltxd = (gmnal_ltxd_t*)context; - gmnal_srxd_t *srxd = ltxd->srxd; - lib_nal_t *libnal = srxd->nal_data->libnal; - int lastone; - struct iovec *riov; - int nriov; - gmnal_data_t *nal_data; - - CDEBUG(D_TRACE, "called for context [%p]\n", context); - - if (status != GM_SUCCESS) { - CDEBUG(D_ERROR, "reports error [%d][%s]\n", status, - gmnal_gm_error(status)); - } - - spin_lock(&srxd->callback_lock); - srxd->ncallbacks--; - srxd->callback_status |= status; - lastone = srxd->ncallbacks?0:1; - spin_unlock(&srxd->callback_lock); - nal_data = srxd->nal_data; - - /* - * everyone returns a send token - */ - gmnal_return_ltxd(nal_data, ltxd); - - if (!lastone) { - CDEBUG(D_ERROR, "NOT final callback context[%p]\n", srxd); - return; - } - - /* - * Let our client application proceed - */ - CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - lib_finalize(libnal, srxd, srxd->cookie, PTL_OK); - - /* - * send an ack to the sender to let him know we got the data - */ - gmnal_large_tx_ack(nal_data, srxd); - - /* - * Unregister the memory that was used - * This is a very slow business (slower then register) - */ - nriov = srxd->nriov; - riov = srxd->riov; - GMNAL_GM_LOCK(nal_data); - while (nriov--) { - CDEBUG(D_ERROR, "deregister memory [%p]\n", riov->iov_base); - if (gm_deregister_memory(srxd->nal_data->gm_port, - riov->iov_base, riov->iov_len)) { - CDEBUG(D_ERROR, "failed to deregister memory [%p]\n", - riov->iov_base); - } - riov++; - } - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(srxd->riov, sizeof(struct iovec)*nriov); - - /* - * repost the receive buffer (return receive token) - */ - GMNAL_GM_LOCK(nal_data); - 
gm_provide_receive_buffer_with_tag(nal_data->gm_port, srxd->buffer, - srxd->gmsize, GM_LOW_PRIORITY, 0); - GMNAL_GM_UNLOCK(nal_data); - - return; -} - - -/* - * Called on target node. - * After pulling data from a source node - * send an ack message to indicate the large transmit is complete. - */ -void -gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) -{ - - gmnal_stxd_t *stxd; - gmnal_msghdr_t *msghdr; - void *buffer = NULL; - unsigned int local_nid; - gm_status_t gm_status = GM_SUCCESS; - - CDEBUG(D_TRACE, "srxd[%p] target_node [%u]\n", srxd, - srxd->gm_source_node); - - GMNAL_GM_LOCK(nal_data); - gm_status = gm_global_id_to_node_id(nal_data->gm_port, - srxd->gm_source_node, &local_nid); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "Failed to obtain local id\n"); - return; - } - CDEBUG(D_INFO, "Local Node_id is [%u][%x]\n", local_nid, local_nid); - - stxd = gmnal_get_stxd(nal_data, 1); - CDEBUG(D_TRACE, "gmnal_large_tx_ack got stxd[%p]\n", stxd); - - stxd->nal_data = nal_data; - stxd->type = GMNAL_LARGE_MESSAGE_ACK; - - /* - * Copy gmnal_msg_hdr and portals header to the transmit buffer - * Then copy the data in - */ - buffer = stxd->buffer; - msghdr = (gmnal_msghdr_t*)buffer; - - /* - * Add in the address of the original stxd from the sender node - * so it knows which thread to notify. 
- */ - msghdr->magic = GMNAL_MAGIC; - msghdr->type = GMNAL_LARGE_MESSAGE_ACK; - msghdr->sender_node_id = nal_data->gm_global_nid; - msghdr->stxd = srxd->source_stxd; - CDEBUG(D_INFO, "processing msghdr at [%p]\n", buffer); - - CDEBUG(D_INFO, "sending\n"); - stxd->msg_size= sizeof(gmnal_msghdr_t); - - - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " - "gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] " - "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, - stxd->msg_size, srxd->gm_source_node, local_nid, stxd); - GMNAL_GM_LOCK(nal_data); - stxd->gm_priority = GM_LOW_PRIORITY; - stxd->gm_target_node = local_nid; - gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, - stxd->gm_size, stxd->msg_size, - GM_LOW_PRIORITY, local_nid, - gmnal_large_tx_ack_callback, - (void*)stxd); - - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "gmnal_large_tx_ack :: done\n"); - - return; -} - - -/* - * A callback to indicate the small transmit operation is compete - * Check for errors and try to deal with them. - * Call lib_finalise to inform the client application that the - * send is complete and the memory can be reused. - * Return the stxd when finished with it (returns a send token) - */ -void -gmnal_large_tx_ack_callback(gm_port_t *gm_port, void *context, - gm_status_t status) -{ - gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; - gmnal_data_t *nal_data = (gmnal_data_t*)stxd->nal_data; - - if (!stxd) { - CDEBUG(D_ERROR, "send completion event for unknown stxd\n"); - return; - } - CDEBUG(D_TRACE, "send completion event for stxd [%p] status is [%d]\n", - stxd, status); - gmnal_return_stxd(stxd->nal_data, stxd); - - GMNAL_GM_UNLOCK(nal_data); - return; -} - -/* - * Indicates the large transmit operation is compete. - * Called on transmit side (means data has been pulled by receiver - * or failed). - * Call lib_finalise to inform the client application that the send - * is complete, deregister the memory and return the stxd. 
- * Finally, report the rx buffer that the ack message was delivered in. - */ -void -gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) -{ - lib_nal_t *libnal = nal_data->libnal; - gmnal_stxd_t *stxd = NULL; - gmnal_msghdr_t *msghdr = NULL; - void *buffer = NULL; - struct iovec *iov; - - - CDEBUG(D_TRACE, "gmnal_large_tx_ack_received buffer [%p]\n", buffer); - - buffer = srxd->buffer; - msghdr = (gmnal_msghdr_t*)buffer; - stxd = msghdr->stxd; - - CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - - lib_finalize(libnal, stxd, stxd->cookie, PTL_OK); - - /* - * extract the iovec from the stxd, deregister the memory. - * free the space used to store the iovec - */ - iov = stxd->iov; - while(stxd->niov--) { - CDEBUG(D_INFO, "deregister memory [%p] size ["LPSZ"]\n", - iov->iov_base, iov->iov_len); - GMNAL_GM_LOCK(nal_data); - gm_deregister_memory(nal_data->gm_port, iov->iov_base, - iov->iov_len); - GMNAL_GM_UNLOCK(nal_data); - iov++; - } - - /* - * return the send token - * TO DO It is bad to hold onto the send token so long? - */ - gmnal_return_stxd(nal_data, stxd); - - - /* - * requeue the receive buffer - */ - gmnal_rx_requeue_buffer(nal_data, srxd); - - - return; -} diff --git a/lustre/portals/knals/gmnal/gmnal_module.c b/lustre/portals/knals/gmnal/gmnal_module.c deleted file mode 100644 index 3aca90f..0000000 --- a/lustre/portals/knals/gmnal/gmnal_module.c +++ /dev/null @@ -1,134 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "gmnal.h" - - -int gmnal_small_msg_size = 525312; -/* - * -1 indicates default value. - * This is 1 thread per cpu - * See start_kernel_threads - */ -int num_rx_threads = -1; -int num_stxds = 5; -int gm_port = 4; - -int -gmnal_cmd(struct portals_cfg *pcfg, void *private) -{ - gmnal_data_t *nal_data = NULL; - char *name = NULL; - int nid = -2; - int gnid; - gm_status_t gm_status; - - - CDEBUG(D_TRACE, "gmnal_cmd [%d] private [%p]\n", - pcfg->pcfg_command, private); - nal_data = (gmnal_data_t*)private; - switch(pcfg->pcfg_command) { - /* - * just reuse already defined GET_NID. 
Should define GMNAL version - */ - case(GMNAL_IOC_GET_GNID): - - PORTAL_ALLOC(name, pcfg->pcfg_plen1); - copy_from_user(name, pcfg->pcfg_pbuf1, pcfg->pcfg_plen1); - - GMNAL_GM_LOCK(nal_data); - //nid = gm_host_name_to_node_id(nal_data->gm_port, name); - gm_status = gm_host_name_to_node_id_ex (nal_data->gm_port, 0, name, &nid); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_INFO, "gm_host_name_to_node_id_ex(...host %s) failed[%d]\n", - name, gm_status); - return (-1); - } else - CDEBUG(D_INFO, "Local node %s id is [%d]\n", name, nid); - GMNAL_GM_LOCK(nal_data); - gm_status = gm_node_id_to_global_id(nal_data->gm_port, - nid, &gnid); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_INFO, "gm_node_id_to_global_id failed[%d]\n", - gm_status); - return(-1); - } - CDEBUG(D_INFO, "Global node is is [%u][%x]\n", gnid, gnid); - copy_to_user(pcfg->pcfg_pbuf2, &gnid, pcfg->pcfg_plen2); - break; - default: - CDEBUG(D_INFO, "gmnal_cmd UNKNOWN[%d]\n", pcfg->pcfg_command); - pcfg->pcfg_nid2 = -1; - } - - - return(0); -} - - -static int __init -gmnal_load(void) -{ - int status; - CDEBUG(D_TRACE, "This is the gmnal module initialisation routine\n"); - - - CDEBUG(D_INFO, "Calling gmnal_init\n"); - status = gmnal_init(); - if (status == PTL_OK) { - CDEBUG(D_INFO, "Portals GMNAL initialised ok\n"); - } else { - CDEBUG(D_INFO, "Portals GMNAL Failed to initialise\n"); - return(-ENODEV); - - } - - CDEBUG(D_INFO, "This is the end of the gmnal init routine"); - - - return(0); -} - - -static void __exit -gmnal_unload(void) -{ - gmnal_fini(); - return; -} - - -module_init(gmnal_load); - -module_exit(gmnal_unload); - -MODULE_PARM(gmnal_small_msg_size, "i"); -MODULE_PARM(num_rx_threads, "i"); -MODULE_PARM(num_stxds, "i"); -MODULE_PARM(gm_port, "i"); - -MODULE_AUTHOR("Morgan Doyle"); - -MODULE_DESCRIPTION("A Portals kernel NAL for Myrinet GM."); - -MODULE_LICENSE("GPL"); diff --git a/lustre/portals/knals/gmnal/gmnal_utils.c 
b/lustre/portals/knals/gmnal/gmnal_utils.c deleted file mode 100644 index 6a52319..0000000 --- a/lustre/portals/knals/gmnal/gmnal_utils.c +++ /dev/null @@ -1,1075 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, http://www.lustre.org/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -/* - * All utilities required by lgmanl - */ - -#include "gmnal.h" - -/* - * Am I one of the gmnal rxthreads ? - */ -int -gmnal_is_rxthread(gmnal_data_t *nal_data) -{ - int i; - for (i=0; irxthread_pid[i] == current->pid) - return(1); - } - return(0); -} - - -/* - * Allocate tx descriptors/tokens (large and small) - * allocate a number of small tx buffers and register with GM - * so they are wired and set up for DMA. This is a costly operation. - * Also allocate a corrosponding descriptor to keep track of - * the buffer. - * Put all small descriptors on singly linked list to be available to send - * function. - * Allocate the rest of the available tx tokens for large messages. 
These will be - * used to do gm_gets in gmnal_copyiov - */ -int -gmnal_alloc_txd(gmnal_data_t *nal_data) -{ - int ntx= 0, nstx= 0, nrxt_stx= 0, - nltx= 0, i = 0; - gmnal_stxd_t *txd = NULL; - gmnal_ltxd_t *ltxd = NULL; - void *txbuffer = NULL; - - CDEBUG(D_TRACE, "gmnal_alloc_small tx\n"); - - GMNAL_GM_LOCK(nal_data); - /* - * total number of transmit tokens - */ - ntx = gm_num_send_tokens(nal_data->gm_port); - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "total number of send tokens available is [%d]\n", ntx); - - /* - * allocate a number for small sends - * num_stxds from gmnal_module.c - */ - nstx = num_stxds; - /* - * give that number plus 1 to the receive threads - */ - nrxt_stx = nstx + 1; - - /* - * give the rest for gm_gets - */ - nltx = ntx - (nrxt_stx + nstx); - if (nltx < 1) { - CDEBUG(D_ERROR, "No tokens available for large messages\n"); - return(GMNAL_STATUS_FAIL); - } - - - /* - * A semaphore is initialised with the - * number of transmit tokens available. - * To get a stxd, acquire the token semaphore. 
- * this decrements the available token count - * (if no tokens you block here, someone returning a - * stxd will release the semaphore and wake you) - * When token is obtained acquire the spinlock - * to manipulate the list - */ - GMNAL_TXD_TOKEN_INIT(nal_data, nstx); - GMNAL_TXD_LOCK_INIT(nal_data); - GMNAL_RXT_TXD_TOKEN_INIT(nal_data, nrxt_stx); - GMNAL_RXT_TXD_LOCK_INIT(nal_data); - GMNAL_LTXD_TOKEN_INIT(nal_data, nltx); - GMNAL_LTXD_LOCK_INIT(nal_data); - - for (i=0; i<=nstx; i++) { - PORTAL_ALLOC(txd, sizeof(gmnal_stxd_t)); - if (!txd) { - CDEBUG(D_ERROR, "Failed to malloc txd [%d]\n", i); - return(GMNAL_STATUS_NOMEM); - } - GMNAL_GM_LOCK(nal_data); - txbuffer = gm_dma_malloc(nal_data->gm_port, - GMNAL_SMALL_MSG_SIZE(nal_data)); - GMNAL_GM_UNLOCK(nal_data); - if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); - PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); - return(GMNAL_STATUS_FAIL); - } - txd->buffer = txbuffer; - txd->buffer_size = GMNAL_SMALL_MSG_SIZE(nal_data); - txd->gm_size = gm_min_size_for_length(txd->buffer_size); - txd->nal_data = (struct _gmnal_data_t*)nal_data; - txd->rxt = 0; - - txd->next = nal_data->stxd; - nal_data->stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " - "size [%d]\n", txd, txd->buffer, txd->buffer_size); - } - - for (i=0; i<=nrxt_stx; i++) { - PORTAL_ALLOC(txd, sizeof(gmnal_stxd_t)); - if (!txd) { - CDEBUG(D_ERROR, "Failed to malloc txd [%d]\n", i); - return(GMNAL_STATUS_NOMEM); - } - GMNAL_GM_LOCK(nal_data); - txbuffer = gm_dma_malloc(nal_data->gm_port, - GMNAL_SMALL_MSG_SIZE(nal_data)); - GMNAL_GM_UNLOCK(nal_data); - if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); - PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); - return(GMNAL_STATUS_FAIL); - } - txd->buffer = txbuffer; - txd->buffer_size = GMNAL_SMALL_MSG_SIZE(nal_data); - txd->gm_size = 
gm_min_size_for_length(txd->buffer_size); - txd->nal_data = (struct _gmnal_data_t*)nal_data; - txd->rxt = 1; - - txd->next = nal_data->rxt_stxd; - nal_data->rxt_stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " - "size [%d]\n", txd, txd->buffer, txd->buffer_size); - } - - /* - * string together large tokens - */ - for (i=0; i<=nltx ; i++) { - PORTAL_ALLOC(ltxd, sizeof(gmnal_ltxd_t)); - ltxd->next = nal_data->ltxd; - nal_data->ltxd = ltxd; - } - return(GMNAL_STATUS_OK); -} - -/* Free the list of wired and gm_registered small tx buffers and - * the tx descriptors that go along with them. - */ -void -gmnal_free_txd(gmnal_data_t *nal_data) -{ - gmnal_stxd_t *txd = nal_data->stxd, *_txd = NULL; - gmnal_ltxd_t *ltxd = NULL, *_ltxd = NULL; - - CDEBUG(D_TRACE, "gmnal_free_small tx\n"); - - while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " - "size [%d]\n", txd, txd->buffer, txd->buffer_size); - _txd = txd; - txd = txd->next; - GMNAL_GM_LOCK(nal_data); - gm_dma_free(nal_data->gm_port, _txd->buffer); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(_txd, sizeof(gmnal_stxd_t)); - } - txd = nal_data->rxt_stxd; - while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " - "size [%d]\n", txd, txd->buffer, txd->buffer_size); - _txd = txd; - txd = txd->next; - GMNAL_GM_LOCK(nal_data); - gm_dma_free(nal_data->gm_port, _txd->buffer); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(_txd, sizeof(gmnal_stxd_t)); - } - ltxd = nal_data->ltxd; - while(txd) { - _ltxd = ltxd; - ltxd = ltxd->next; - PORTAL_FREE(_ltxd, sizeof(gmnal_ltxd_t)); - } - - return; -} - - -/* - * Get a txd from the list - * This get us a wired and gm_registered small tx buffer. - * This implicitly gets us a send token also. 
- */ -gmnal_stxd_t * -gmnal_get_stxd(gmnal_data_t *nal_data, int block) -{ - - gmnal_stxd_t *txd = NULL; - pid_t pid = current->pid; - - - CDEBUG(D_TRACE, "gmnal_get_stxd nal_data [%p] block[%d] pid [%d]\n", - nal_data, block, pid); - - if (gmnal_is_rxthread(nal_data)) { - CDEBUG(D_INFO, "RXTHREAD Attempting to get token\n"); - GMNAL_RXT_TXD_GETTOKEN(nal_data); - GMNAL_RXT_TXD_LOCK(nal_data); - txd = nal_data->rxt_stxd; - nal_data->rxt_stxd = txd->next; - GMNAL_RXT_TXD_UNLOCK(nal_data); - CDEBUG(D_INFO, "RXTHREAD got [%p], head is [%p]\n", - txd, nal_data->rxt_stxd); - txd->kniov = 0; - txd->rxt = 1; - } else { - if (block) { - CDEBUG(D_INFO, "Attempting to get token\n"); - GMNAL_TXD_GETTOKEN(nal_data); - CDEBUG(D_PORTALS, "Got token\n"); - } else { - if (GMNAL_TXD_TRYGETTOKEN(nal_data)) { - CDEBUG(D_ERROR, "can't get token\n"); - return(NULL); - } - } - GMNAL_TXD_LOCK(nal_data); - txd = nal_data->stxd; - nal_data->stxd = txd->next; - GMNAL_TXD_UNLOCK(nal_data); - CDEBUG(D_INFO, "got [%p], head is [%p]\n", txd, - nal_data->stxd); - txd->kniov = 0; - } /* general txd get */ - return(txd); -} - -/* - * Return a txd to the list - */ -void -gmnal_return_stxd(gmnal_data_t *nal_data, gmnal_stxd_t *txd) -{ - CDEBUG(D_TRACE, "nal_data [%p], txd[%p] rxt[%d]\n", nal_data, - txd, txd->rxt); - - /* - * this transmit descriptor is - * for the rxthread - */ - if (txd->rxt) { - GMNAL_RXT_TXD_LOCK(nal_data); - txd->next = nal_data->rxt_stxd; - nal_data->rxt_stxd = txd; - GMNAL_RXT_TXD_UNLOCK(nal_data); - GMNAL_RXT_TXD_RETURNTOKEN(nal_data); - CDEBUG(D_INFO, "Returned stxd to rxthread list\n"); - } else { - GMNAL_TXD_LOCK(nal_data); - txd->next = nal_data->stxd; - nal_data->stxd = txd; - GMNAL_TXD_UNLOCK(nal_data); - GMNAL_TXD_RETURNTOKEN(nal_data); - CDEBUG(D_INFO, "Returned stxd to general list\n"); - } - return; -} - - -/* - * Get a large transmit descriptor from the free list - * This implicitly gets us a transmit token . - * always wait for one. 
- */ -gmnal_ltxd_t * -gmnal_get_ltxd(gmnal_data_t *nal_data) -{ - - gmnal_ltxd_t *ltxd = NULL; - - CDEBUG(D_TRACE, "nal_data [%p]\n", nal_data); - - GMNAL_LTXD_GETTOKEN(nal_data); - GMNAL_LTXD_LOCK(nal_data); - ltxd = nal_data->ltxd; - nal_data->ltxd = ltxd->next; - GMNAL_LTXD_UNLOCK(nal_data); - CDEBUG(D_INFO, "got [%p], head is [%p]\n", ltxd, nal_data->ltxd); - return(ltxd); -} - -/* - * Return an ltxd to the list - */ -void -gmnal_return_ltxd(gmnal_data_t *nal_data, gmnal_ltxd_t *ltxd) -{ - CDEBUG(D_TRACE, "nal_data [%p], ltxd[%p]\n", nal_data, ltxd); - - GMNAL_LTXD_LOCK(nal_data); - ltxd->next = nal_data->ltxd; - nal_data->ltxd = ltxd; - GMNAL_LTXD_UNLOCK(nal_data); - GMNAL_LTXD_RETURNTOKEN(nal_data); - return; -} -/* - * allocate a number of small rx buffers and register with GM - * so they are wired and set up for DMA. This is a costly operation. - * Also allocate a corrosponding descriptor to keep track of - * the buffer. - * Put all descriptors on singly linked list to be available to - * receive thread. 
- */ -int -gmnal_alloc_srxd(gmnal_data_t *nal_data) -{ - int nrx = 0, nsrx = 0, i = 0; - gmnal_srxd_t *rxd = NULL; - void *rxbuffer = NULL; - - CDEBUG(D_TRACE, "gmnal_alloc_small rx\n"); - - GMNAL_GM_LOCK(nal_data); - nrx = gm_num_receive_tokens(nal_data->gm_port); - GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_INFO, "total number of receive tokens available is [%d]\n", - nrx); - - nsrx = nrx/2; - nsrx = 12; - /* - * make the number of rxds twice our total - * number of stxds plus 1 - */ - nsrx = num_stxds*2 + 2; - - CDEBUG(D_INFO, "Allocated [%d] receive tokens to small messages\n", - nsrx); - - - GMNAL_GM_LOCK(nal_data); - nal_data->srxd_hash = gm_create_hash(gm_hash_compare_ptrs, - gm_hash_hash_ptr, 0, 0, nsrx, 0); - GMNAL_GM_UNLOCK(nal_data); - if (!nal_data->srxd_hash) { - CDEBUG(D_ERROR, "Failed to create hash table\n"); - return(GMNAL_STATUS_NOMEM); - } - - GMNAL_RXD_TOKEN_INIT(nal_data, nsrx); - GMNAL_RXD_LOCK_INIT(nal_data); - - for (i=0; i<=nsrx; i++) { - PORTAL_ALLOC(rxd, sizeof(gmnal_srxd_t)); - if (!rxd) { - CDEBUG(D_ERROR, "Failed to malloc rxd [%d]\n", i); - return(GMNAL_STATUS_NOMEM); - } -#if 0 - PORTAL_ALLOC(rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); - if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], " - "size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); - PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); - return(GMNAL_STATUS_FAIL); - } - CDEBUG(D_NET, "Calling gm_register_memory with port [%p] " - "rxbuffer [%p], size [%d]\n", nal_data->gm_port, - rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); - GMNAL_GM_LOCK(nal_data); - gm_status = gm_register_memory(nal_data->gm_port, rxbuffer, - GMNAL_SMALL_MSG_SIZE(nal_data)); - GMNAL_GM_UNLOCK(nal_data); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p]," - " index [%d]\n", rxbuffer, i); - switch(gm_status) { - case(GM_FAILURE): - CDEBUG(D_ERROR, "GM_FAILURE\n"); - break; - case(GM_PERMISSION_DENIED): - CDEBUG(D_ERROR, "PERMISSION_DENIED\n"); - break; - 
case(GM_INVALID_PARAMETER): - CDEBUG(D_ERROR, "INVALID_PARAMETER\n"); - break; - default: - CDEBUG(D_ERROR, "Unknown error[%d]\n", - gm_status); - break; - - } - return(GMNAL_STATUS_FAIL); - } -#else - GMNAL_GM_LOCK(nal_data); - rxbuffer = gm_dma_malloc(nal_data->gm_port, - GMNAL_SMALL_MSG_SIZE(nal_data)); - GMNAL_GM_UNLOCK(nal_data); - if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d]," - " size [%d]\n", i, - GMNAL_SMALL_MSG_SIZE(nal_data)); - PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); - return(GMNAL_STATUS_FAIL); - } -#endif - - rxd->buffer = rxbuffer; - rxd->size = GMNAL_SMALL_MSG_SIZE(nal_data); - rxd->gmsize = gm_min_size_for_length(rxd->size); - - if (gm_hash_insert(nal_data->srxd_hash, - (void*)rxbuffer, (void*)rxd)) { - - CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] " - "for rxbuffer[%p]\n", rxd, rxbuffer); - return(GMNAL_STATUS_FAIL); - } - - rxd->next = nal_data->srxd; - nal_data->srxd = rxd; - CDEBUG(D_INFO, "Registered rxd [%p] with buffer [%p], " - "size [%d]\n", rxd, rxd->buffer, rxd->size); - } - - return(GMNAL_STATUS_OK); -} - - - -/* Free the list of wired and gm_registered small rx buffers and the - * rx descriptors that go along with them. - */ -void -gmnal_free_srxd(gmnal_data_t *nal_data) -{ - gmnal_srxd_t *rxd = nal_data->srxd, *_rxd = NULL; - - CDEBUG(D_TRACE, "gmnal_free_small rx\n"); - - while(rxd) { - CDEBUG(D_INFO, "Freeing rxd [%p] buffer [%p], size [%d]\n", - rxd, rxd->buffer, rxd->size); - _rxd = rxd; - rxd = rxd->next; - -#if 0 - GMNAL_GM_LOCK(nal_data); - gm_deregister_memory(nal_data->gm_port, _rxd->buffer, - _rxd->size); - GMNAL_GM_UNLOCK(nal_data); - PORTAL_FREE(_rxd->buffer, GMNAL_SMALL_RXBUFFER_SIZE); -#else - GMNAL_GM_LOCK(nal_data); - gm_dma_free(nal_data->gm_port, _rxd->buffer); - GMNAL_GM_UNLOCK(nal_data); -#endif - PORTAL_FREE(_rxd, sizeof(gmnal_srxd_t)); - } - return; -} - - -/* - * Get a rxd from the free list - * This get us a wired and gm_registered small rx buffer. 
- * This implicitly gets us a receive token also. - */ -gmnal_srxd_t * -gmnal_get_srxd(gmnal_data_t *nal_data, int block) -{ - - gmnal_srxd_t *rxd = NULL; - CDEBUG(D_TRACE, "nal_data [%p] block [%d]\n", nal_data, block); - - if (block) { - GMNAL_RXD_GETTOKEN(nal_data); - } else { - if (GMNAL_RXD_TRYGETTOKEN(nal_data)) { - CDEBUG(D_INFO, "gmnal_get_srxd Can't get token\n"); - return(NULL); - } - } - GMNAL_RXD_LOCK(nal_data); - rxd = nal_data->srxd; - if (rxd) - nal_data->srxd = rxd->next; - GMNAL_RXD_UNLOCK(nal_data); - CDEBUG(D_INFO, "got [%p], head is [%p]\n", rxd, nal_data->srxd); - return(rxd); -} - -/* - * Return an rxd to the list - */ -void -gmnal_return_srxd(gmnal_data_t *nal_data, gmnal_srxd_t *rxd) -{ - CDEBUG(D_TRACE, "nal_data [%p], rxd[%p]\n", nal_data, rxd); - - GMNAL_RXD_LOCK(nal_data); - rxd->next = nal_data->srxd; - nal_data->srxd = rxd; - GMNAL_RXD_UNLOCK(nal_data); - GMNAL_RXD_RETURNTOKEN(nal_data); - return; -} - -/* - * Given a pointer to a srxd find - * the relevant descriptor for it - * This is done by searching a hash - * list that is created when the srxd's - * are created - */ -gmnal_srxd_t * -gmnal_rxbuffer_to_srxd(gmnal_data_t *nal_data, void *rxbuffer) -{ - gmnal_srxd_t *srxd = NULL; - CDEBUG(D_TRACE, "nal_data [%p], rxbuffer [%p]\n", nal_data, rxbuffer); - srxd = gm_hash_find(nal_data->srxd_hash, rxbuffer); - CDEBUG(D_INFO, "srxd is [%p]\n", srxd); - return(srxd); -} - - -void -gmnal_stop_rxthread(gmnal_data_t *nal_data) -{ - int delay = 30; - - - - CDEBUG(D_TRACE, "Attempting to stop rxthread nal_data [%p]\n", - nal_data); - - nal_data->rxthread_stop_flag = GMNAL_THREAD_STOP; - - gmnal_remove_rxtwe(nal_data); - /* - * kick the thread - */ - up(&nal_data->rxtwe_wait); - - while(nal_data->rxthread_flag != GMNAL_THREAD_RESET && delay--) { - CDEBUG(D_INFO, "gmnal_stop_rxthread sleeping\n"); - gmnal_yield(1); - up(&nal_data->rxtwe_wait); - } - - if (nal_data->rxthread_flag != GMNAL_THREAD_RESET) { - CDEBUG(D_ERROR, "I don't know how to wake 
the thread\n"); - } else { - CDEBUG(D_INFO, "rx thread seems to have stopped\n"); - } -} - -void -gmnal_stop_ctthread(gmnal_data_t *nal_data) -{ - int delay = 15; - - - - CDEBUG(D_TRACE, "Attempting to stop ctthread nal_data [%p]\n", - nal_data); - - nal_data->ctthread_flag = GMNAL_THREAD_STOP; - GMNAL_GM_LOCK(nal_data); - gm_set_alarm(nal_data->gm_port, &nal_data->ctthread_alarm, 10, - NULL, NULL); - GMNAL_GM_UNLOCK(nal_data); - - while(nal_data->ctthread_flag == GMNAL_THREAD_STOP && delay--) { - CDEBUG(D_INFO, "gmnal_stop_ctthread sleeping\n"); - gmnal_yield(1); - } - - if (nal_data->ctthread_flag == GMNAL_THREAD_STOP) { - CDEBUG(D_ERROR, "I DON'T KNOW HOW TO WAKE THE THREAD\n"); - } else { - CDEBUG(D_INFO, "CT THREAD SEEMS TO HAVE STOPPED\n"); - } -} - - - -char * -gmnal_gm_error(gm_status_t status) -{ - return(gm_strerror(status)); - - switch(status) { - case(GM_SUCCESS): - return("SUCCESS"); - case(GM_FAILURE): - return("FAILURE"); - case(GM_INPUT_BUFFER_TOO_SMALL): - return("INPUT_BUFFER_TOO_SMALL"); - case(GM_OUTPUT_BUFFER_TOO_SMALL): - return("OUTPUT_BUFFER_TOO_SMALL"); - case(GM_TRY_AGAIN ): - return("TRY_AGAIN"); - case(GM_BUSY): - return("BUSY"); - case(GM_MEMORY_FAULT): - return("MEMORY_FAULT"); - case(GM_INTERRUPTED): - return("INTERRUPTED"); - case(GM_INVALID_PARAMETER): - return("INVALID_PARAMETER"); - case(GM_OUT_OF_MEMORY): - return("OUT_OF_MEMORY"); - case(GM_INVALID_COMMAND): - return("INVALID_COMMAND"); - case(GM_PERMISSION_DENIED): - return("PERMISSION_DENIED"); - case(GM_INTERNAL_ERROR): - return("INTERNAL_ERROR"); - case(GM_UNATTACHED): - return("UNATTACHED"); - case(GM_UNSUPPORTED_DEVICE): - return("UNSUPPORTED_DEVICE"); - case(GM_SEND_TIMED_OUT): - return("GM_SEND_TIMEDOUT"); - case(GM_SEND_REJECTED): - return("GM_SEND_REJECTED"); - case(GM_SEND_TARGET_PORT_CLOSED): - return("GM_SEND_TARGET_PORT_CLOSED"); - case(GM_SEND_TARGET_NODE_UNREACHABLE): - return("GM_SEND_TARGET_NODE_UNREACHABLE"); - case(GM_SEND_DROPPED): - 
return("GM_SEND_DROPPED"); - case(GM_SEND_PORT_CLOSED): - return("GM_SEND_PORT_CLOSED"); - case(GM_NODE_ID_NOT_YET_SET): - return("GM_NODE_ID_NOT_YET_SET"); - case(GM_STILL_SHUTTING_DOWN): - return("GM_STILL_SHUTTING_DOWN"); - case(GM_CLONE_BUSY): - return("GM_CLONE_BUSY"); - case(GM_NO_SUCH_DEVICE): - return("GM_NO_SUCH_DEVICE"); - case(GM_ABORTED): - return("GM_ABORTED"); - case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - return("GM_INCOMPATIBLE_LIB_AND_DRIVER"); - case(GM_UNTRANSLATED_SYSTEM_ERROR): - return("GM_UNTRANSLATED_SYSTEM_ERROR"); - case(GM_ACCESS_DENIED): - return("GM_ACCESS_DENIED"); - - -/* - * These ones are in the docs but aren't in the header file - case(GM_DEV_NOT_FOUND): - return("GM_DEV_NOT_FOUND"); - case(GM_INVALID_PORT_NUMBER): - return("GM_INVALID_PORT_NUMBER"); - case(GM_UC_ERROR): - return("GM_US_ERROR"); - case(GM_PAGE_TABLE_FULL): - return("GM_PAGE_TABLE_FULL"); - case(GM_MINOR_OVERFLOW): - return("GM_MINOR_OVERFLOW"); - case(GM_SEND_ORPHANED): - return("GM_SEND_ORPHANED"); - case(GM_HARDWARE_FAULT): - return("GM_HARDWARE_FAULT"); - case(GM_DATA_CORRUPTED): - return("GM_DATA_CORRUPTED"); - case(GM_TIMED_OUT): - return("GM_TIMED_OUT"); - case(GM_USER_ERROR): - return("GM_USER_ERROR"); - case(GM_NO_MATCH): - return("GM_NOMATCH"); - case(GM_NOT_SUPPORTED_IN_KERNEL): - return("GM_NOT_SUPPORTED_IN_KERNEL"); - case(GM_NOT_SUPPORTED_ON_ARCH): - return("GM_NOT_SUPPORTED_ON_ARCH"); - case(GM_PTE_REF_CNT_OVERFLOW): - return("GM_PTR_REF_CNT_OVERFLOW"); - case(GM_NO_DRIVER_SUPPORT): - return("GM_NO_DRIVER_SUPPORT"); - case(GM_FIRMWARE_NOT_RUNNING): - return("GM_FIRMWARE_NOT_RUNNING"); - - * These ones are in the docs but aren't in the header file - */ - default: - return("UNKNOWN GM ERROR CODE"); - } -} - - -char * -gmnal_rxevent(gm_recv_event_t *ev) -{ - short event; - event = GM_RECV_EVENT_TYPE(ev); - switch(event) { - case(GM_NO_RECV_EVENT): - return("GM_NO_RECV_EVENT"); - case(GM_SENDS_FAILED_EVENT): - return("GM_SEND_FAILED_EVENT"); - 
case(GM_ALARM_EVENT): - return("GM_ALARM_EVENT"); - case(GM_SENT_EVENT): - return("GM_SENT_EVENT"); - case(_GM_SLEEP_EVENT): - return("_GM_SLEEP_EVENT"); - case(GM_RAW_RECV_EVENT): - return("GM_RAW_RECV_EVENT"); - case(GM_BAD_SEND_DETECTED_EVENT): - return("GM_BAD_SEND_DETECTED_EVENT"); - case(GM_SEND_TOKEN_VIOLATION_EVENT): - return("GM_SEND_TOKEN_VIOLATION_EVENT"); - case(GM_RECV_TOKEN_VIOLATION_EVENT): - return("GM_RECV_TOKEN_VIOLATION_EVENT"); - case(GM_BAD_RECV_TOKEN_EVENT): - return("GM_BAD_RECV_TOKEN_EVENT"); - case(GM_ALARM_VIOLATION_EVENT): - return("GM_ALARM_VIOLATION_EVENT"); - case(GM_RECV_EVENT): - return("GM_RECV_EVENT"); - case(GM_HIGH_RECV_EVENT): - return("GM_HIGH_RECV_EVENT"); - case(GM_PEER_RECV_EVENT): - return("GM_PEER_RECV_EVENT"); - case(GM_HIGH_PEER_RECV_EVENT): - return("GM_HIGH_PEER_RECV_EVENT"); - case(GM_FAST_RECV_EVENT): - return("GM_FAST_RECV_EVENT"); - case(GM_FAST_HIGH_RECV_EVENT): - return("GM_FAST_HIGH_RECV_EVENT"); - case(GM_FAST_PEER_RECV_EVENT): - return("GM_FAST_PEER_RECV_EVENT"); - case(GM_FAST_HIGH_PEER_RECV_EVENT): - return("GM_FAST_HIGH_PEER_RECV_EVENT"); - case(GM_REJECTED_SEND_EVENT): - return("GM_REJECTED_SEND_EVENT"); - case(GM_ORPHANED_SEND_EVENT): - return("GM_ORPHANED_SEND_EVENT"); - case(GM_BAD_RESEND_DETECTED_EVENT): - return("GM_BAD_RESEND_DETETED_EVENT"); - case(GM_DROPPED_SEND_EVENT): - return("GM_DROPPED_SEND_EVENT"); - case(GM_BAD_SEND_VMA_EVENT): - return("GM_BAD_SEND_VMA_EVENT"); - case(GM_BAD_RECV_VMA_EVENT): - return("GM_BAD_RECV_VMA_EVENT"); - case(_GM_FLUSHED_ALARM_EVENT): - return("GM_FLUSHED_ALARM_EVENT"); - case(GM_SENT_TOKENS_EVENT): - return("GM_SENT_TOKENS_EVENTS"); - case(GM_IGNORE_RECV_EVENT): - return("GM_IGNORE_RECV_EVENT"); - case(GM_ETHERNET_RECV_EVENT): - return("GM_ETHERNET_RECV_EVENT"); - case(GM_NEW_NO_RECV_EVENT): - return("GM_NEW_NO_RECV_EVENT"); - case(GM_NEW_SENDS_FAILED_EVENT): - return("GM_NEW_SENDS_FAILED_EVENT"); - case(GM_NEW_ALARM_EVENT): - return("GM_NEW_ALARM_EVENT"); - 
case(GM_NEW_SENT_EVENT): - return("GM_NEW_SENT_EVENT"); - case(_GM_NEW_SLEEP_EVENT): - return("GM_NEW_SLEEP_EVENT"); - case(GM_NEW_RAW_RECV_EVENT): - return("GM_NEW_RAW_RECV_EVENT"); - case(GM_NEW_BAD_SEND_DETECTED_EVENT): - return("GM_NEW_BAD_SEND_DETECTED_EVENT"); - case(GM_NEW_SEND_TOKEN_VIOLATION_EVENT): - return("GM_NEW_SEND_TOKEN_VIOLATION_EVENT"); - case(GM_NEW_RECV_TOKEN_VIOLATION_EVENT): - return("GM_NEW_RECV_TOKEN_VIOLATION_EVENT"); - case(GM_NEW_BAD_RECV_TOKEN_EVENT): - return("GM_NEW_BAD_RECV_TOKEN_EVENT"); - case(GM_NEW_ALARM_VIOLATION_EVENT): - return("GM_NEW_ALARM_VIOLATION_EVENT"); - case(GM_NEW_RECV_EVENT): - return("GM_NEW_RECV_EVENT"); - case(GM_NEW_HIGH_RECV_EVENT): - return("GM_NEW_HIGH_RECV_EVENT"); - case(GM_NEW_PEER_RECV_EVENT): - return("GM_NEW_PEER_RECV_EVENT"); - case(GM_NEW_HIGH_PEER_RECV_EVENT): - return("GM_NEW_HIGH_PEER_RECV_EVENT"); - case(GM_NEW_FAST_RECV_EVENT): - return("GM_NEW_FAST_RECV_EVENT"); - case(GM_NEW_FAST_HIGH_RECV_EVENT): - return("GM_NEW_FAST_HIGH_RECV_EVENT"); - case(GM_NEW_FAST_PEER_RECV_EVENT): - return("GM_NEW_FAST_PEER_RECV_EVENT"); - case(GM_NEW_FAST_HIGH_PEER_RECV_EVENT): - return("GM_NEW_FAST_HIGH_PEER_RECV_EVENT"); - case(GM_NEW_REJECTED_SEND_EVENT): - return("GM_NEW_REJECTED_SEND_EVENT"); - case(GM_NEW_ORPHANED_SEND_EVENT): - return("GM_NEW_ORPHANED_SEND_EVENT"); - case(_GM_NEW_PUT_NOTIFICATION_EVENT): - return("_GM_NEW_PUT_NOTIFICATION_EVENT"); - case(GM_NEW_FREE_SEND_TOKEN_EVENT): - return("GM_NEW_FREE_SEND_TOKEN_EVENT"); - case(GM_NEW_FREE_HIGH_SEND_TOKEN_EVENT): - return("GM_NEW_FREE_HIGH_SEND_TOKEN_EVENT"); - case(GM_NEW_BAD_RESEND_DETECTED_EVENT): - return("GM_NEW_BAD_RESEND_DETECTED_EVENT"); - case(GM_NEW_DROPPED_SEND_EVENT): - return("GM_NEW_DROPPED_SEND_EVENT"); - case(GM_NEW_BAD_SEND_VMA_EVENT): - return("GM_NEW_BAD_SEND_VMA_EVENT"); - case(GM_NEW_BAD_RECV_VMA_EVENT): - return("GM_NEW_BAD_RECV_VMA_EVENT"); - case(_GM_NEW_FLUSHED_ALARM_EVENT): - return("GM_NEW_FLUSHED_ALARM_EVENT"); - 
case(GM_NEW_SENT_TOKENS_EVENT): - return("GM_NEW_SENT_TOKENS_EVENT"); - case(GM_NEW_IGNORE_RECV_EVENT): - return("GM_NEW_IGNORE_RECV_EVENT"); - case(GM_NEW_ETHERNET_RECV_EVENT): - return("GM_NEW_ETHERNET_RECV_EVENT"); - default: - return("Unknown Recv event"); -#if 0 - case(/* _GM_PUT_NOTIFICATION_EVENT */ - case(/* GM_FREE_SEND_TOKEN_EVENT */ - case(/* GM_FREE_HIGH_SEND_TOKEN_EVENT */ -#endif - } -} - - -void -gmnal_yield(int delay) -{ - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(delay); -} - -int -gmnal_is_small_msg(gmnal_data_t *nal_data, int niov, struct iovec *iov, - int len) -{ - - CDEBUG(D_TRACE, "len [%d] limit[%d]\n", len, - GMNAL_SMALL_MSG_SIZE(nal_data)); - - if ((len + sizeof(ptl_hdr_t) + sizeof(gmnal_msghdr_t)) - < GMNAL_SMALL_MSG_SIZE(nal_data)) { - - CDEBUG(D_INFO, "Yep, small message\n"); - return(1); - } else { - CDEBUG(D_ERROR, "No, not small message\n"); - /* - * could be made up of lots of little ones ! - */ - return(0); - } - -} - -/* - * extract info from the receive event. - * Have to do this before the next call to gm_receive - * Deal with all endian stuff here. 
- * Then stick work entry on list where rxthreads - * can get it to complete the receive - */ -int -gmnal_add_rxtwe(gmnal_data_t *nal_data, gm_recv_t *recv) -{ - gmnal_rxtwe_t *we = NULL; - - CDEBUG(D_NET, "adding entry to list\n"); - - PORTAL_ALLOC(we, sizeof(gmnal_rxtwe_t)); - if (!we) { - CDEBUG(D_ERROR, "failed to malloc\n"); - return(GMNAL_STATUS_FAIL); - } - we->buffer = gm_ntohp(recv->buffer); - we->snode = (int)gm_ntoh_u16(recv->sender_node_id); - we->sport = (int)gm_ntoh_u8(recv->sender_port_id); - we->type = (int)gm_ntoh_u8(recv->type); - we->length = (int)gm_ntohl(recv->length); - - spin_lock(&nal_data->rxtwe_lock); - if (nal_data->rxtwe_tail) { - nal_data->rxtwe_tail->next = we; - } else { - nal_data->rxtwe_head = we; - nal_data->rxtwe_tail = we; - } - nal_data->rxtwe_tail = we; - spin_unlock(&nal_data->rxtwe_lock); - - up(&nal_data->rxtwe_wait); - return(GMNAL_STATUS_OK); -} - -void -gmnal_remove_rxtwe(gmnal_data_t *nal_data) -{ - gmnal_rxtwe_t *_we, *we = nal_data->rxtwe_head; - - CDEBUG(D_NET, "removing all work list entries\n"); - - spin_lock(&nal_data->rxtwe_lock); - CDEBUG(D_NET, "Got lock\n"); - while (we) { - _we = we; - we = we->next; - PORTAL_FREE(_we, sizeof(gmnal_rxtwe_t)); - } - spin_unlock(&nal_data->rxtwe_lock); - nal_data->rxtwe_head = NULL; - nal_data->rxtwe_tail = NULL; -} - -gmnal_rxtwe_t * -gmnal_get_rxtwe(gmnal_data_t *nal_data) -{ - gmnal_rxtwe_t *we = NULL; - - CDEBUG(D_NET, "Getting entry to list\n"); - - do { - down(&nal_data->rxtwe_wait); - if (nal_data->rxthread_stop_flag == GMNAL_THREAD_STOP) { - /* - * time to stop - * TO DO some one free the work entries - */ - return(NULL); - } - spin_lock(&nal_data->rxtwe_lock); - if (nal_data->rxtwe_head) { - CDEBUG(D_INFO, "Got a work entry\n"); - we = nal_data->rxtwe_head; - nal_data->rxtwe_head = we->next; - if (!nal_data->rxtwe_head) - nal_data->rxtwe_tail = NULL; - } else { - CDEBUG(D_WARNING, "woken but no work\n"); - } - spin_unlock(&nal_data->rxtwe_lock); - } while (!we); - - 
CDEBUG(D_INFO, "Returning we[%p]\n", we); - return(we); -} - - -/* - * Start the caretaker thread and a number of receiver threads - * The caretaker thread gets events from the gm library. - * It passes receive events to the receiver threads via a work list. - * It processes other events itself in gm_unknown. These will be - * callback events or sleeps. - */ -int -gmnal_start_kernel_threads(gmnal_data_t *nal_data) -{ - - int threads = 0; - /* - * the alarm is used to wake the caretaker thread from - * gm_unknown call (sleeping) to exit it. - */ - CDEBUG(D_NET, "Initializing caretaker thread alarm and flag\n"); - gm_initialize_alarm(&nal_data->ctthread_alarm); - nal_data->ctthread_flag = GMNAL_THREAD_RESET; - - - CDEBUG(D_INFO, "Starting caretaker thread\n"); - nal_data->ctthread_pid = - kernel_thread(gmnal_ct_thread, (void*)nal_data, 0); - if (nal_data->ctthread_pid <= 0) { - CDEBUG(D_ERROR, "Caretaker thread failed to start\n"); - return(GMNAL_STATUS_FAIL); - } - - while (nal_data->rxthread_flag != GMNAL_THREAD_RESET) { - gmnal_yield(1); - CDEBUG(D_INFO, "Waiting for caretaker thread signs of life\n"); - } - - CDEBUG(D_INFO, "caretaker thread has started\n"); - - - /* - * Now start a number of receiver threads - * these treads get work to do from the caretaker (ct) thread - */ - nal_data->rxthread_flag = GMNAL_THREAD_RESET; - nal_data->rxthread_stop_flag = GMNAL_THREAD_RESET; - - for (threads=0; threadsrxthread_pid[threads] = -1; - spin_lock_init(&nal_data->rxtwe_lock); - spin_lock_init(&nal_data->rxthread_flag_lock); - sema_init(&nal_data->rxtwe_wait, 0); - nal_data->rxtwe_head = NULL; - nal_data->rxtwe_tail = NULL; - /* - * If the default number of receive threades isn't - * modified at load time, then start one thread per cpu - */ - if (num_rx_threads == -1) - num_rx_threads = smp_num_cpus; - CDEBUG(D_INFO, "Starting [%d] receive threads\n", num_rx_threads); - for (threads=0; threadsrxthread_pid[threads] = - kernel_thread(gmnal_rx_thread, (void*)nal_data, 0); - 
if (nal_data->rxthread_pid[threads] <= 0) { - CDEBUG(D_ERROR, "Receive thread failed to start\n"); - gmnal_stop_rxthread(nal_data); - gmnal_stop_ctthread(nal_data); - return(GMNAL_STATUS_FAIL); - } - } - - for (;;) { - spin_lock(&nal_data->rxthread_flag_lock); - if (nal_data->rxthread_flag == GMNAL_RXTHREADS_STARTED) { - spin_unlock(&nal_data->rxthread_flag_lock); - break; - } - spin_unlock(&nal_data->rxthread_flag_lock); - gmnal_yield(1); - } - - CDEBUG(D_INFO, "receive threads seem to have started\n"); - - return(GMNAL_STATUS_OK); -} diff --git a/lustre/portals/knals/iibnal/.cvsignore b/lustre/portals/knals/iibnal/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/knals/iibnal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/iibnal/Makefile.in b/lustre/portals/knals/iibnal/Makefile.in deleted file mode 100644 index e7934e2..0000000 --- a/lustre/portals/knals/iibnal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kiibnal -kiibnal-objs := iibnal.o iibnal_cb.o - -EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/iibnal/Makefile.mk b/lustre/portals/knals/iibnal/Makefile.mk deleted file mode 100644 index 0459a20..0000000 --- a/lustre/portals/knals/iibnal/Makefile.mk +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -include $(src)/../../Kernelenv - -obj-y += kiibnal.o -kiibnal-objs := iibnal.o iibnal_cb.o - diff --git a/lustre/portals/knals/iibnal/autoMakefile.am b/lustre/portals/knals/iibnal/autoMakefile.am deleted file mode 100644 index 251df66..0000000 --- a/lustre/portals/knals/iibnal/autoMakefile.am +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. 
-# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -if BUILD_IIBNAL -modulenet_DATA = kiibnal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h diff --git a/lustre/portals/knals/iibnal/iibnal.c b/lustre/portals/knals/iibnal/iibnal.c deleted file mode 100644 index 09908c9..0000000 --- a/lustre/portals/knals/iibnal/iibnal.c +++ /dev/null @@ -1,1713 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include "iibnal.h" - -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_tunables_t kibnal_tunables; - -kib_data_t kibnal_data = { - .kib_service_id = IBNAL_SERVICE_NUMBER, -}; - -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 - -#define IBNAL_SYSCTL_TIMEOUT 1 - -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif - -#ifdef unused -void -print_service(IB_SERVICE_RECORD *service, char *tag, int rc) -{ - char name[32]; - - if (service == NULL) - { - CWARN("tag : %s\n" - "status : %d (NULL)\n", tag, rc); - return; - } - strncpy (name, service->ServiceName, sizeof(name)-1); - name[sizeof(name)-1] = 0; - - CWARN("tag : %s\n" - "status : %d\n" - "service id: "LPX64"\n" - "name : %s\n" - "NID : "LPX64"\n", tag, rc, - service->RID.ServiceID, name, - *kibnal_service_nid_field(service)); -} -#endif - -static void -kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, - FSTATUS frc, uint32 madrc) -{ - *(FSTATUS *)arg = frc; - up (&kibnal_data.kib_nid_signal); -} - -#if IBNAL_CHECK_ADVERT -static void -kibnal_service_query_done (void *arg, QUERY *qry, - QUERY_RESULT_VALUES *qry_result) -{ - FSTATUS frc = qry_result->Status; - - if (frc != FSUCCESS && - qry_result->ResultDataSize == 0) - frc = FERROR; - - *(FSTATUS *)arg = frc; - up (&kibnal_data.kib_nid_signal); -} - -static void -kibnal_check_advert (void) -{ - QUERY *qry; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; - - PORTAL_ALLOC(qry, sizeof(*qry)); - if (qry == NULL) - return; - - memset (qry, 0, sizeof(*qry)); - qry->InputType = InputTypeServiceRecord; - qry->OutputType = OutputTypeServiceRecord; - qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; - svc = &qry->InputValue.ServiceRecordValue.ServiceRecord; - 
kibnal_set_service_keys(svc, kibnal_data.kib_nid); - - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - qry, - kibnal_service_query_done, - NULL, &frc2); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d checking SM service\n", frc); - } else { - down (&kibnal_data.kib_nid_signal); - frc = frc2; - - if (frc != 0) - CERROR ("Error %d checking SM service\n", rc); - } - - return (rc); -} -#endif - -static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) -{ - IB_SERVICE_RECORD *svc; - - memset (fod, 0, sizeof(*fod)); - fod->Type = type; - - svc = &fod->Value.ServiceRecordValue.ServiceRecord; - svc->RID.ServiceID = kibnal_data.kib_service_id; - svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; - svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; - svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; - svc->ServiceLease = 0xffffffff; - - kibnal_set_service_keys(svc, kibnal_data.kib_nid); -} - -static int -kibnal_advertise (void) -{ - FABRIC_OPERATION_DATA *fod; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - - PORTAL_ALLOC(fod, sizeof(*fod)); - if (fod == NULL) - return (-ENOMEM); - - fill_fod(fod, FabOpSetServiceRecord); - svc = &fod->Value.ServiceRecordValue.ServiceRecord; - - CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - svc->RID.ServiceID, - svc->ServiceName, *kibnal_service_nid_field(svc)); - - frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - fod, kibnal_service_setunset_done, - NULL, &frc2); - - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d advertising NID "LPX64"\n", - frc, kibnal_data.kib_nid); - goto out; - } - - down (&kibnal_data.kib_nid_signal); - - frc = frc2; - if (frc != FSUCCESS) - CERROR ("Error %d advertising BUD "LPX64"\n", - frc, kibnal_data.kib_nid); -out: - PORTAL_FREE(fod, 
sizeof(*fod)); - return (frc == FSUCCESS) ? 0 : -EINVAL; -} - -static void -kibnal_unadvertise (int expect_success) -{ - FABRIC_OPERATION_DATA *fod; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - - PORTAL_ALLOC(fod, sizeof(*fod)); - if (fod == NULL) - return; - - fill_fod(fod, FabOpDeleteServiceRecord); - svc = &fod->Value.ServiceRecordValue.ServiceRecord; - - CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - svc->ServiceName, *kibnal_service_nid_field(svc)); - - frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - fod, kibnal_service_setunset_done, - NULL, &frc2); - - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d unadvertising NID "LPX64"\n", - frc, kibnal_data.kib_nid); - goto out; - } - - down (&kibnal_data.kib_nid_signal); - - if ((frc2 == FSUCCESS) == !!expect_success) - goto out; - - if (expect_success) - CERROR("Error %d unadvertising NID "LPX64"\n", - frc2, kibnal_data.kib_nid); - else - CWARN("Removed conflicting NID "LPX64"\n", - kibnal_data.kib_nid); - out: - PORTAL_FREE(fod, sizeof(*fod)); -} - -static int -kibnal_set_mynid(ptl_nid_t nid) -{ - struct timeval tv; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; - FSTATUS frc; - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - do_gettimeofday(&tv); - - down (&kibnal_data.kib_nid_mutex); - - if (nid == kibnal_data.kib_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); - - if (kibnal_data.kib_nid != PTL_NID_ANY) { - - kibnal_unadvertise (1); - - frc = iibt_cm_cancel(kibnal_data.kib_cep); - if (frc != FSUCCESS && frc != FPENDING) - CERROR ("Error %d stopping listener\n", frc); - - frc = iibt_cm_destroy_cep(kibnal_data.kib_cep); - if (frc != FSUCCESS) - CERROR ("Error %d destroying CEP\n", frc); - - kibnal_data.kib_cep = NULL; - } - - 
kibnal_data.kib_nid = ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. */ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (kibnal_data.kib_nid == PTL_NID_ANY) { - /* No new NID to install */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - /* remove any previous advert (crashed node etc) */ - kibnal_unadvertise(0); - - kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE); - if (kibnal_data.kib_cep == NULL) { - CERROR ("Can't create CEP\n"); - rc = -ENOMEM; - } else { - CM_LISTEN_INFO info; - memset (&info, 0, sizeof(info)); - info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id; - - frc = iibt_cm_listen(kibnal_data.kib_cep, &info, - kibnal_listen_callback, NULL); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("iibt_cm_listen error: %d\n", frc); - rc = -EINVAL; - } else { - rc = 0; - } - } - - if (rc == 0) { - rc = kibnal_advertise(); - if (rc == 0) { -#if IBNAL_CHECK_ADVERT - kibnal_check_advert(); -#endif - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - iibt_cm_cancel (kibnal_data.kib_cep); - iibt_cm_destroy_cep (kibnal_data.kib_cep); - /* remove any peers that sprung up while I failed to - * advertise myself */ - kibnal_del_peer (PTL_NID_ANY, 0); - } - - kibnal_data.kib_nid = PTL_NID_ANY; - up (&kibnal_data.kib_nid_mutex); - return (rc); -} - -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - LASSERT (nid != PTL_NID_ANY); - - PORTAL_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) - return (NULL); - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->ibp_nid = nid; - atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD (&peer->ibp_conns); - INIT_LIST_HEAD (&peer->ibp_tx_queue); - - peer->ibp_reconnect_time = jiffies; - 
peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - - atomic_inc (&kibnal_data.kib_npeers); - return (peer); -} - -void -kibnal_destroy_peer (kib_peer_t *peer) -{ - - LASSERT (atomic_read (&peer->ibp_refcount) == 0); - LASSERT (peer->ibp_persistence == 0); - LASSERT (!kibnal_peer_active(peer)); - LASSERT (peer->ibp_connecting == 0); - LASSERT (list_empty (&peer->ibp_conns)); - LASSERT (list_empty (&peer->ibp_tx_queue)); - - PORTAL_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. */ - atomic_dec (&kibnal_data.kib_npeers); -} - -/* the caller is responsible for accounting for the additional reference - * that this creates */ -kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) -{ - struct list_head *peer_list = kibnal_nid2peerlist (nid); - struct list_head *tmp; - kib_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, kib_peer_t, ibp_list); - - LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ - peer->ibp_connecting != 0 || /* creating conns */ - !list_empty (&peer->ibp_conns)); /* active conn */ - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); - return (peer); - } - return (NULL); -} - -kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - read_lock (&kibnal_data.kib_global_lock); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? 
*/ - kib_peer_addref(peer); - read_unlock (&kibnal_data.kib_global_lock); - - return (peer); -} - -void -kibnal_unlink_peer_locked (kib_peer_t *peer) -{ - LASSERT (peer->ibp_persistence == 0); - LASSERT (list_empty(&peer->ibp_conns)); - - LASSERT (kibnal_peer_active(peer)); - list_del_init (&peer->ibp_list); - /* lose peerlist's ref */ - kib_peer_decref(peer); -} - -static int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) -{ - kib_peer_t *peer; - struct list_head *ptmp; - int i; - - read_lock (&kibnal_data.kib_global_lock); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *persistencep = peer->ibp_persistence; - - read_unlock (&kibnal_data.kib_global_lock); - return (0); - } - } - - read_unlock (&kibnal_data.kib_global_lock); - return (-ENOENT); -} - -static int -kibnal_add_persistent_peer (ptl_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_peer_t *peer2; - - if (nid == PTL_NID_ANY) - return (-EINVAL); - - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - peer2 = kibnal_find_peer_locked (nid); - if (peer2 != NULL) { - kib_peer_decref (peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist (nid)); - } - - peer->ibp_persistence++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (0); -} - -static void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; - - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; - - 
if (peer->ibp_persistence != 0) - return; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); - - kibnal_close_conn_locked (conn, 0); - } - - /* NB peer unlinks itself when last conn is closed */ -} - -int -kibnal_del_peer (ptl_nid_t nid, int single_share) -{ - unsigned long flags; - struct list_head *ptmp; - struct list_head *pnxt; - kib_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) - continue; - - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ - - if (single_share) - goto out; - } - } - out: - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - return (rc); -} - -static kib_conn_t * -kibnal_get_conn_by_idx (int index) -{ - kib_peer_t *peer; - struct list_head *ptmp; - kib_conn_t *conn; - struct list_head *ctmp; - int i; - - read_lock (&kibnal_data.kib_global_lock); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence > 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - list_for_each (ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); 
- read_unlock (&kibnal_data.kib_global_lock); - return (conn); - } - } - } - - read_unlock (&kibnal_data.kib_global_lock); - return (NULL); -} - -kib_conn_t * -kibnal_create_conn (void) -{ - kib_conn_t *conn; - int i; - __u64 vaddr = 0; - __u64 vaddr_base; - int page_offset; - int ipage; - int rc; - FSTATUS frc; - union { - IB_QP_ATTRIBUTES_CREATE qp_create; - IB_QP_ATTRIBUTES_MODIFY qp_attr; - } params; - - PORTAL_ALLOC (conn, sizeof (*conn)); - if (conn == NULL) { - CERROR ("Can't allocate connection\n"); - return (NULL); - } - - /* zero flags, NULL pointers etc... */ - memset (conn, 0, sizeof (*conn)); - - INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_active_txs); - spin_lock_init (&conn->ibc_lock); - - atomic_inc (&kibnal_data.kib_nconns); - /* well not really, but I call destroy() on failure, which decrements */ - - PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) - goto failed; - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); - if (rc != 0) - goto failed; - - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - if (kibnal_whole_mem()) - rx->rx_vaddr = kibnal_page2phys(page) + - page_offset + - kibnal_data.kib_md.md_addr; - else - rx->rx_vaddr = vaddr; - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_RX_MSG_PAGES); - } - } - - params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { - .Type = QPTypeReliableConnected, - .SendQDepth = IBNAL_TX_MAX_SG * - 
IBNAL_MSG_QUEUE_SIZE, - .RecvQDepth = IBNAL_MSG_QUEUE_SIZE, - .SendDSListDepth = 1, - .RecvDSListDepth = 1, - .SendCQHandle = kibnal_data.kib_cq, - .RecvCQHandle = kibnal_data.kib_cq, - .PDHandle = kibnal_data.kib_pd, - .SendSignaledCompletions = TRUE, - }; - frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL, - &conn->ibc_qp, &conn->ibc_qp_attrs); - if (rc != 0) { - CERROR ("Failed to create queue pair: %d\n", rc); - goto failed; - } - - /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - - params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateInit, - .Attrs = (IB_QP_ATTR_PORTGUID | - IB_QP_ATTR_PKEYINDEX | - IB_QP_ATTR_ACCESSCONTROL), - .PortGUID = kibnal_data.kib_port_guid, - .PkeyIndex = 0, - .AccessControl = { - .s = { - .RdmaWrite = 1, - .RdmaRead = 1, - }, - }, - }; - rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL); - if (rc != 0) { - CERROR ("Failed to modify queue pair: %d\n", rc); - goto failed; - } - - /* 1 ref for caller */ - atomic_set (&conn->ibc_refcount, 1); - return (conn); - - failed: - kibnal_destroy_conn (conn); - return (NULL); -} - -void -kibnal_destroy_conn (kib_conn_t *conn) -{ - int rc; - FSTATUS frc; - - CDEBUG (D_NET, "connection %p\n", conn); - - LASSERT (atomic_read (&conn->ibc_refcount) == 0); - LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_active_txs)); - LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); - - switch (conn->ibc_state) { - case IBNAL_CONN_DISCONNECTED: - /* called after connection sequence initiated */ - /* fall through */ - - case IBNAL_CONN_INIT_QP: - /* _destroy includes an implicit Reset of the QP which - * discards posted work */ - rc = iibt_qp_destroy(conn->ibc_qp); - if (rc != 0) - CERROR("Can't destroy QP: %d\n", rc); - /* fall through */ - - case IBNAL_CONN_INIT_NOTHING: - break; - - default: - LASSERT (0); - } - - if (conn->ibc_cep != NULL) { - frc = iibt_cm_destroy_cep(conn->ibc_cep); - if (frc != 0) -
CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, - frc); - } - - if (conn->ibc_rx_pages != NULL) - kibnal_free_pages(conn->ibc_rx_pages); - - if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, - IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - if (conn->ibc_peer != NULL) - kib_peer_decref(conn->ibc_peer); - - PORTAL_FREE(conn, sizeof (*conn)); - - atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. */ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_connd_waitq); - } -} - -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* must disconnect before dropping the final ref */ - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); -} - -static int -kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - count++; - kibnal_close_conn_locked (conn, why); - } - - return (count); -} - -int -kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - if (conn->ibc_incarnation 
== incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->ibp_nid, conn->ibc_incarnation, incarnation); - - count++; - kibnal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -static int -kibnal_close_matching_conns (ptl_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kibnal_close_peer_conns_locked (peer, 0); - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == PTL_NID_ANY) - return (0); - - return (count == 0 ? 
-ENOENT : 0); -} - -static int -kibnal_cmd(struct portals_cfg *pcfg, void * private) -{ - int rc = -EINVAL; - ENTRY; - - LASSERT (pcfg != NULL); - - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); - break; - } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); - break; - } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); - } - break; - } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); - break; - } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) - rc = -EINVAL; - else - rc = kibnal_set_mynid (pcfg->pcfg_nid); - break; - } - } - - RETURN(rc); -} - -void -kibnal_free_pages (kib_pages_t *p) -{ - int npages = p->ibp_npages; - int rc; - int i; - - if (p->ibp_mapped) { - rc = iibt_deregister_memory(p->ibp_handle); - if (rc != 0) - CERROR ("Deregister error: %d\n", rc); - } - - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); - - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); -} - -int -kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) -{ - kib_pages_t *p; - __u64 *phys_pages; - int i; - FSTATUS frc; - IB_ACCESS_CONTROL access; - - memset(&access, 0, sizeof(access)); - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; - - 
PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); - if (p == NULL) { - CERROR ("Can't allocate buffer %d\n", npages); - return (-ENOMEM); - } - - memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->ibp_pages[i] == NULL) { - CERROR ("Can't allocate page %d of %d\n", i, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - } - - if (kibnal_whole_mem()) - goto out; - - PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); - if (phys_pages == NULL) { - CERROR ("Can't allocate physarray for %d pages\n", npages); - /* XXX free ibp_pages? */ - kibnal_free_pages(p); - return (-ENOMEM); - } - - /* if we were using the _contig_ registration variant we would have - * an array of PhysAddr/Length pairs, but the discontiguous variant - * just takes the PhysAddr */ - for (i = 0; i < npages; i++) - phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]); - - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - 0, /* requested vaddr */ - phys_pages, npages, - 0, /* offset */ - kibnal_data.kib_pd, - access, - &p->ibp_handle, &p->ibp_vaddr, - &p->ibp_lkey, &p->ibp_rkey); - - PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); - - if (frc != FSUCCESS) { - CERROR ("Error %d mapping %d pages\n", frc, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - - CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" " - "lkey %x rkey %x\n", npages, p->ibp_handle, - p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); - - p->ibp_mapped = 1; -out: - *pp = p; - return (0); -} - -static int -kibnal_setup_tx_descs (void) -{ - int ipage = 0; - int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; - struct page *page; - kib_tx_t *tx; - int i; - int rc; - - /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - - rc 
= kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, - 0); - if (rc != 0) - return (rc); - - /* ignored for the whole_mem case */ - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - - for (i = 0; i < IBNAL_TX_MSGS; i++) { - page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; - tx = &kibnal_data.kib_tx_descs[i]; - - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); - - if (kibnal_whole_mem()) - tx->tx_vaddr = kibnal_page2phys(page) + - page_offset + - kibnal_data.kib_md.md_addr; - else - tx->tx_vaddr = vaddr; - - tx->tx_isnblk = (i >= IBNAL_NTX); - tx->tx_mapped = KIB_TX_UNMAPPED; - - CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", - i, tx, tx->tx_msg, tx->tx_vaddr); - - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); - } - } - - return (0); -} - -static void -kibnal_api_shutdown (nal_t *nal) -{ - int i; - int rc; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &kibnal_api); - - switch (kibnal_data.kib_init) { - default: - CERROR ("Unexpected state %d\n", kibnal_data.kib_init); - LBUG(); - - case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(IIBNAL); - /* No new peers */ - - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kibnal_set_mynid (PTL_NID_ANY); - - /* Wait for all peer state to clean up (crazy) */ - i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { - i++; - 
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect (can take a few seconds)\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_CQ: - rc = iibt_cq_destroy(kibnal_data.kib_cq); - if (rc != 0) - CERROR ("Destroy CQ error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_TXD: - kibnal_free_pages (kibnal_data.kib_tx_pages); - /* fall through */ - - case IBNAL_INIT_MR: - if (kibnal_data.kib_md.md_handle != NULL) { - rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle); - if (rc != FSUCCESS) - CERROR ("Deregister memory: %d\n", rc); - } - /* fall through */ - -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif - case IBNAL_INIT_PD: - rc = iibt_pd_free(kibnal_data.kib_pd); - if (rc != 0) - CERROR ("Destroy PD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_SD: - rc = iibt_sd_deregister(kibnal_data.kib_sd); - if (rc != 0) - CERROR ("Deregister SD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_PORT: - /* XXX ??? 
*/ - /* fall through */ - - case IBNAL_INIT_PORTATTRS: - PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, - kibnal_data.kib_hca_attrs.PortAttributesListSize); - /* fall through */ - - case IBNAL_INIT_HCA: - rc = iibt_close_hca(kibnal_data.kib_hca); - if (rc != 0) - CERROR ("Close HCA error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); - /* fall through */ - - case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); - LASSERT (kibnal_data.kib_peers != NULL); - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - LASSERT (list_empty (&kibnal_data.kib_peers[i])); - } - LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); - LASSERT (list_empty (&kibnal_data.kib_connd_conns)); - LASSERT (list_empty (&kibnal_data.kib_connd_peers)); - - /* flag threads to terminate; wake and wait for them to die */ - kibnal_data.kib_shutdown = 1; - wake_up_all (&kibnal_data.kib_sched_waitq); - wake_up_all (&kibnal_data.kib_connd_waitq); - - i = 2; - while (atomic_read (&kibnal_data.kib_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "Waiting for %d threads to terminate\n", - atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_NOTHING: - break; - } - - if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - - if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * - kibnal_data.kib_peer_hash_size); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); - - kibnal_data.kib_init = IBNAL_INIT_NOTHING; -} - -#define roundup_power(val, power) \ - ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) - -/* this isn't very portable or sturdy in the face of funny mem/bus configs */ -static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr) -{ - struct sysinfo si; - __u64 ret; - - /* XXX we don't bother with first-gen cards */ - if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101) - return 0ULL; - - si_meminfo(&si); - ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; - return roundup_power(ret, 128 * 1024 * 1024); -} -#undef roundup_power - -static int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - IB_PORT_ATTRIBUTES *pattr; - FSTATUS frc; - int rc; - int n; - int i; - - LASSERT (nal == &kibnal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } - - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); - - frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, - &kibnal_data.kib_interfaces); - if (frc != FSUCCESS) { - 
CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n", - frc); - return -ENOSYS; - } - - init_MUTEX (&kibnal_data.kib_nid_mutex); - init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); - kibnal_data.kib_nid = PTL_NID_ANY; - - rwlock_init(&kibnal_data.kib_global_lock); - - kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, - sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); - if (kibnal_data.kib_peers == NULL) { - goto failed; - } - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); - - spin_lock_init (&kibnal_data.kib_connd_lock); - INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); - INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); - init_waitqueue_head (&kibnal_data.kib_connd_waitq); - - spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); - init_waitqueue_head (&kibnal_data.kib_sched_waitq); - - spin_lock_init (&kibnal_data.kib_tx_lock); - INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) { - CERROR ("Can't allocate tx descs\n"); - goto failed; - } - - /* lists/ptrs/locks initialised */ - kibnal_data.kib_init = IBNAL_INIT_DATA; - /*****************************************************/ - - process_id.pid = 0; - process_id.nid = kibnal_data.kib_nid; - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ - - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)i); - if (rc != 0) 
{ - CERROR("Can't spawn iibnal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - rc = kibnal_thread_start (kibnal_connd, NULL); - if (rc != 0) { - CERROR ("Can't spawn iibnal connd: %d\n", rc); - goto failed; - } - - n = sizeof(kibnal_data.kib_hca_guids) / - sizeof(kibnal_data.kib_hca_guids[0]); - frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids); - if (frc != FSUCCESS) { - CERROR ("Can't get channel adapter guids: %d\n", frc); - goto failed; - } - if (n == 0) { - CERROR ("No channel adapters found\n"); - goto failed; - } - - /* Infinicon has per-HCA rather than per CQ completion handlers */ - frc = iibt_open_hca(kibnal_data.kib_hca_guids[0], - kibnal_ca_callback, - kibnal_ca_async_callback, - &kibnal_data.kib_hca, - &kibnal_data.kib_hca); - if (frc != FSUCCESS) { - CERROR ("Can't open CA[0]: %d\n", frc); - goto failed; - } - - /* Channel Adapter opened */ - kibnal_data.kib_init = IBNAL_INIT_HCA; - /*****************************************************/ - - kibnal_data.kib_hca_attrs.PortAttributesList = NULL; - kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; - frc = iibt_query_hca(kibnal_data.kib_hca, - &kibnal_data.kib_hca_attrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't size port attrs: %d\n", frc); - goto failed; - } - - PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, - kibnal_data.kib_hca_attrs.PortAttributesListSize); - if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) - goto failed; - - /* Port attrs allocated */ - kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; - /*****************************************************/ - - frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, - NULL); - if (frc != FSUCCESS) { - CERROR ("Can't get port attrs for CA 0: %d\n", frc); - goto failed; - } - - for (i = 0, pattr = kibnal_data.kib_hca_attrs.PortAttributesList; - pattr != NULL; - i++, pattr = pattr->Next) { - switch (pattr->PortState) { - default: - CERROR("Unexpected port[%d] state %d\n", - i, 
pattr->PortState); - continue; - case PortStateDown: - CDEBUG(D_NET, "port[%d] Down\n", i); - continue; - case PortStateInit: - CDEBUG(D_NET, "port[%d] Init\n", i); - continue; - case PortStateArmed: - CDEBUG(D_NET, "port[%d] Armed\n", i); - continue; - - case PortStateActive: - CDEBUG(D_NET, "port[%d] Active\n", i); - kibnal_data.kib_port = i; - kibnal_data.kib_port_guid = pattr->GUID; - kibnal_data.kib_port_pkey = pattr->PkeyTable[0]; - break; - } - break; - } - - if (pattr == NULL) { - CERROR ("Can't find an active port\n"); - goto failed; - } - - CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); - - /* Active port found */ - kibnal_data.kib_init = IBNAL_INIT_PORT; - /*****************************************************/ - - frc = iibt_sd_register(&kibnal_data.kib_sd, NULL); - if (frc != FSUCCESS) { - CERROR ("Can't register with SD: %d\n", frc); - goto failed; - } - - /* Registered with SD OK */ - kibnal_data.kib_init = IBNAL_INIT_SD; - /*****************************************************/ - - frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); - if (frc != FSUCCESS) { - CERROR ("Can't create PD: %d\n", rc); - goto failed; - } - - /* flag PD initialised */ - kibnal_data.kib_init = IBNAL_INIT_PD; - /*****************************************************/ - -#if IBNAL_FMR - { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif -
/*****************************************************/ - if (IBNAL_WHOLE_MEM) { - IB_MR_PHYS_BUFFER phys; - IB_ACCESS_CONTROL access; - kib_md_t *md = &kibnal_data.kib_md; - - memset(&access, 0, sizeof(access)); - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; - - phys.PhysAddr = 0; - phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs); - if (phys.Length == 0) { - CERROR ("couldn't determine the end of phys mem\n"); - goto failed; - } - - rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca, - 0, - &phys, 1, - 0, - kibnal_data.kib_pd, - access, - &md->md_handle, - &md->md_addr, - &md->md_lkey, - &md->md_rkey); - if (rc != FSUCCESS) { - CERROR("registering physical memory failed: %d\n", - rc); - CERROR("falling back to registration per-rdma\n"); - md->md_handle = NULL; - } else { - CDEBUG(D_NET, "registered "LPU64" bytes of mem\n", - phys.Length); - kibnal_data.kib_init = IBNAL_INIT_MR; - } - } - - /*****************************************************/ - - rc = kibnal_setup_tx_descs(); - if (rc != 0) { - CERROR ("Can't register tx descs: %d\n", rc); - goto failed; - } - - /* flag TX descs initialised */ - kibnal_data.kib_init = IBNAL_INIT_TXD; - /*****************************************************/ - - { - uint32 nentries; - - frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, - &kibnal_data.kib_cq, &kibnal_data.kib_cq, - &nentries); - if (frc != FSUCCESS) { - CERROR ("Can't create RX CQ: %d\n", frc); - goto failed; - } - - /* flag CQ initialised */ - kibnal_data.kib_init = IBNAL_INIT_CQ; - - if (nentries < IBNAL_CQ_ENTRIES) { - CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES); - goto failed; - } - - rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC); - if (rc != 0) { - CERROR ("Failed to re-arm completion queue: %d\n", rc); - goto failed; - } - } - - /*****************************************************/ - - rc = libcfs_nal_cmd_register(IIBNAL, 
&kibnal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - goto failed; - } - - /* flag everything initialised */ - kibnal_data.kib_init = IBNAL_INIT_ALL; - /*****************************************************/ - - printk(KERN_INFO "Lustre: Infinicon IB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); - - failed: - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); -} - -void __exit -kibnal_module_fini (void) -{ -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(IIBNAL); -} - -int __init -kibnal_module_init (void) -{ - int rc; - - if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) { - CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n"); - return -EINVAL; - } - - /* the following must be sizeof(int) for proc_dointvec() */ - if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { - CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); - return -EINVAL; - } - - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - - rc = ptl_register_nal(IIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(IIBNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init(kibnal_module_init); -module_exit(kibnal_module_fini); - diff --git a/lustre/portals/knals/iibnal/iibnal.h b/lustre/portals/knals/iibnal/iibnal.h deleted file mode 100644 index 3242158..0000000 --- a/lustre/portals/knals/iibnal/iibnal.h +++ /dev/null @@ -1,892 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include - -#include - -#define GCC_VERSION (__GNUC__ * 10000 \ - + __GNUC_MINOR__ * 100 \ - + __GNUC_PATCHLEVEL__) - -/* Test for GCC > 3.2.2 */ -#if GCC_VERSION <= 30202 -/* GCC 3.2.2, and presumably several versions before it, will - * miscompile this driver. See - * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ -#error Invalid GCC version. 
Must use GCC >= 3.2.3 -#endif - -#define IBNAL_SERVICE_NAME "iibnal" -#define IBNAL_SERVICE_NUMBER 0x11b9a1 - -#if CONFIG_SMP -# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define IBNAL_N_SCHED 1 /* # schedulers */ -#endif - -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ -/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ -#define IBNAL_RETRY 5 /* # times to retry */ -#define IBNAL_RNR_RETRY 5 /* */ -#define IBNAL_CM_RETRY 5 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ - -#define IBNAL_NTX 64 /* # tx descs */ -/* this had to be dropped down so that we only register < 255 pages per - * region. this will change if we register all memory. */ -#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ - -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ - -/************************/ -/* derived constants... 
*/ - -/* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) -#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) - -/* RX messages (per connection) */ -#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - - -/* we may have up to 2 completions per transmit + - 1 completion per receive, per connection */ -#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ - (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) - -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_FMR 0 -#define IBNAL_WHOLE_MEM 1 -#define IBNAL_CKSUM 0 -//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT - -/* XXX I have no idea. */ -#define IBNAL_STARTING_PSN 1 - -typedef struct -{ - int kib_io_timeout; /* comms timeout (seconds) */ - struct ctl_table_header *kib_sysctl; /* sysctl interface */ -} kib_tunables_t; - -/* some of these have specific types in the stack that just map back - * to the uFOO types, like IB_{L,R}_KEY. */ -typedef struct -{ - int ibp_npages; /* # pages */ - int ibp_mapped; /* mapped? */ - __u64 ibp_vaddr; /* mapped region vaddr */ - __u32 ibp_lkey; /* mapped region lkey */ - __u32 ibp_rkey; /* mapped region rkey */ - IB_HANDLE ibp_handle; /* mapped region handle */ - struct page *ibp_pages[0]; -} kib_pages_t; - -typedef struct -{ - IB_HANDLE md_handle; - __u32 md_lkey; - __u32 md_rkey; - __u64 md_addr; -} kib_md_t __attribute__((packed)); - -typedef struct -{ - int kib_init; /* initialisation state */ - __u64 kib_incarnation; /* which one am I */ - int kib_shutdown; /* shut down? 
*/ - atomic_t kib_nthreads; /* # live threads */ - - __u64 kib_service_id; /* service number I listen on */ - __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ - __u16 kib_port_pkey; /* my pkey, whatever that is */ - ptl_nid_t kib_nid; /* my NID */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ - struct semaphore kib_nid_signal; /* signal completion */ - IB_HANDLE kib_cep; /* connection end point */ - - rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - atomic_t kib_npeers; /* # peers extant */ - atomic_t kib_nconns; /* # connections extant */ - - struct list_head kib_connd_conns; /* connections to progress */ - struct list_head kib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ - unsigned long kib_connd_waketime; /* when connd will wake */ - spinlock_t kib_connd_lock; /* serialise */ - - wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - struct list_head kib_sched_txq; /* tx requiring attention */ - struct list_head kib_sched_rxq; /* rx requiring attention */ - spinlock_t kib_sched_lock; /* serialise */ - - struct kib_tx *kib_tx_descs; /* all the tx descriptors */ - kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ - - struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ - __u64 kib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kib_tx_lock; /* serialise */ - - IB_HANDLE kib_hca; /* The HCA */ - int kib_port; /* port on the device */ - IB_HANDLE kib_pd; /* protection domain */ - IB_HANDLE kib_sd; /* SD handle */ - IB_HANDLE kib_cq; /* completion queue */ - kib_md_t kib_md; /* full-mem registration */ - - void *kib_listen_handle; /* where I listen for connections */ - - 
IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */ - - uint64 kib_hca_guids[8]; /* all the HCA guids */ - IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ - FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */ -} kib_data_t; - -#define IBNAL_INIT_NOTHING 0 -#define IBNAL_INIT_DATA 1 -#define IBNAL_INIT_LIB 2 -#define IBNAL_INIT_HCA 3 -#define IBNAL_INIT_PORTATTRS 4 -#define IBNAL_INIT_PORT 5 -#define IBNAL_INIT_SD 6 -#define IBNAL_INIT_PD 7 -#define IBNAL_INIT_FMR 8 -#define IBNAL_INIT_MR 9 -#define IBNAL_INIT_TXD 10 -#define IBNAL_INIT_CQ 11 -#define IBNAL_INIT_ALL 12 - -/************************************************************************ - * Wire message structs. - * These are sent in sender's byte order (i.e. receiver flips). - * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD - * private data and SM service info), is LE on the wire. - */ - -/* also kib_md_t above */ - -typedef struct -{ - __u32 rd_key; /* remote key */ - __u32 rd_nob; /* # of bytes */ - __u64 rd_addr; /* remote io vaddr */ -} kib_rdma_desc_t __attribute__((packed)); - -typedef struct -{ - ptl_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} kib_immediate_msg_t __attribute__((packed)); - -/* these arrays serve two purposes during rdma. they are built on the passive - * side and sent to the active side as remote arguments. On the active side - * the descs are used as a data structure on the way to local gather items. 
- * the different roles result in split local/remote meaning of desc->rd_key */ -typedef struct -{ - ptl_hdr_t ibrm_hdr; /* portals header */ - __u64 ibrm_cookie; /* opaque completion cookie */ - __u32 ibrm_num_descs; /* how many descs */ - kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ -} kib_rdma_msg_t __attribute__((packed)); - -#define kib_rdma_msg_len(num_descs) \ - offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) - -typedef struct -{ - __u64 ibcm_cookie; /* opaque completion cookie */ - __u32 ibcm_status; /* completion status */ -} kib_completion_msg_t __attribute__((packed)); - -typedef struct -{ - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ -#if IBNAL_CKSUM - __u32 ibm_nob; - __u32 ibm_cksum; -#endif - union { - kib_immediate_msg_t immediate; - kib_rdma_msg_t rdma; - kib_completion_msg_t completion; - } ibm_u __attribute__((packed)); -} kib_msg_t __attribute__((packed)); - -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 1 /* current protocol version */ - -#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ - -/***********************************************************************/ - -typedef struct kib_rx /* receive message */ -{ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_rdma; /* RDMA completion posted? */ - int rx_posted; /* posted? 
*/ - __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - IB_WORK_REQ rx_wrq; - IB_LOCAL_DATASEGMENT rx_gl; /* and it's memory */ -} kib_rx_t; - -typedef struct kib_tx /* transmit message */ -{ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ - struct kib_conn *tx_conn; /* owning conn */ - int tx_mapped; /* mapped for RDMA? */ - int tx_sending; /* # tx callbacks outstanding */ - int tx_status; /* completion status */ - unsigned long tx_deadline; /* completion deadline */ - int tx_passive_rdma; /* peer sucks/blows */ - int tx_passive_rdma_wait; /* waiting for peer to complete */ - __u64 tx_passive_rdma_cookie; /* completion cookie */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - kib_md_t tx_md; /* RDMA mapping (active/passive) */ - __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ - int tx_nsp; /* # send work items */ - IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... 
*/ - IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ -} kib_tx_t; - -#define KIB_TX_UNMAPPED 0 -#define KIB_TX_MAPPED 1 -#define KIB_TX_MAPPED_FMR 2 - -typedef struct kib_wire_connreq -{ - __u32 wcr_magic; /* I'm an openibnal connreq */ - __u16 wcr_version; /* this is my version number */ - __u16 wcr_queue_depth; /* this is my receive queue size */ - __u64 wcr_nid; /* peer's NID */ - __u64 wcr_incarnation; /* peer's incarnation */ -} kib_wire_connreq_t; - -typedef struct kib_gid -{ - __u64 hi, lo; -} kib_gid_t; - -typedef struct kib_connreq -{ - /* connection-in-progress */ - struct kib_conn *cr_conn; - kib_wire_connreq_t cr_wcr; - __u64 cr_tid; - IB_SERVICE_RECORD cr_service; - kib_gid_t cr_gid; - IB_PATH_RECORD cr_path; - CM_REQUEST_INFO cr_cmreq; - CM_CONN_INFO cr_discarded; - CM_REJECT_INFO cr_rej_info; -} kib_connreq_t; - -typedef struct kib_conn -{ - struct kib_peer *ibc_peer; /* owning peer */ - struct list_head ibc_list; /* stash on peer's conn list */ - __u64 ibc_incarnation; /* which instance of the peer */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - atomic_t ibc_nob; /* # bytes buffered */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_rcvd_disconnect;/* received discon request */ - int ibc_sent_disconnect;/* sent discon request */ - struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - kib_rx_t *ibc_rxs; /* the rx descs */ - kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - IB_HANDLE ibc_qp; /* queue pair */ - IB_HANDLE ibc_cep; /* connection ID? 
*/ - IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */ - kib_connreq_t *ibc_connreq; /* connection request state */ -} kib_conn_t; - -#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define IBNAL_CONN_CONNECTING 2 /* started to connect */ -#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ -#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ -#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ -#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ - -#define KIB_ASSERT_CONN_STATE(conn, state) do { \ - LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ -} while (0) - -#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ - LASSERTF(low <= high, "%d %d\n", low, high); \ - LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ - "%d\n", conn->ibc_state); \ -} while (0) - -typedef struct kib_peer -{ - struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - ptl_nid_t ibp_nid; /* who's on the other end(s) */ - atomic_t ibp_refcount; /* # users */ - int ibp_persistence; /* "known" peer refs */ - struct list_head ibp_conns; /* all active connections */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* connecting+accepting */ - unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ - unsigned long ibp_reconnect_interval; /* exponential backoff */ -} kib_peer_t; - - -extern lib_nal_t kibnal_lib; -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; - -/******************************************************************************/ -/* Infinicon IBT interface wrappers */ -#define IIBT_IF (kibnal_data.kib_interfaces.ver2) - -static inline FSTATUS -iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list) -{ - return IIBT_IF.GetCaGuids(hca_count, hca_guid_list); 
-} - -static inline FSTATUS -iibt_open_hca(EUI64 hca_guid, - IB_COMPLETION_CALLBACK completion_callback, - IB_ASYNC_EVENT_CALLBACK async_event_callback, - void *arg, - IB_HANDLE *handle) -{ - return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback, - async_event_callback, arg, handle); -} - -static inline FSTATUS -iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp) -{ - return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp); -} - -static inline FSTATUS -iibt_close_hca(IB_HANDLE hca_handle) -{ - return IIBT_IF.Vpi.CloseCA(hca_handle); -} - -static inline FSTATUS -iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle) -{ - return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle); -} - -static inline FSTATUS -iibt_pd_free(IB_HANDLE pd_handle) -{ - return IIBT_IF.Vpi.FreePD(pd_handle); -} - -static inline FSTATUS -iibt_register_physical_memory(IB_HANDLE hca_handle, - IB_VIRT_ADDR requested_io_va, - void *phys_buffers, uint64 nphys_buffers, - uint32 io_va_offset, IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_VIRT_ADDR *actual_io_va, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va, - phys_buffers, nphys_buffers, - io_va_offset, pd_handle, - access, - mem_handle, actual_io_va, - lkey, rkey); -} - -static inline FSTATUS -iibt_register_contig_physical_memory(IB_HANDLE hca_handle, - IB_VIRT_ADDR requested_io_va, - IB_MR_PHYS_BUFFER *phys_buffers, - uint64 nphys_buffers, - uint32 io_va_offset, IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_VIRT_ADDR *actual_io_va, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, - requested_io_va, - phys_buffers, - nphys_buffers, - io_va_offset, pd_handle, - access, - mem_handle, actual_io_va, - lkey, rkey); -} - -static inline FSTATUS -iibt_register_memory(IB_HANDLE hca_handle, - void *virt_addr, unsigned int length, - 
IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, - virt_addr, length, - pd_handle, - access, - mem_handle, - lkey, rkey); -} - -static inline FSTATUS -iibt_deregister_memory(IB_HANDLE mem_handle) -{ - return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle); -} - -static inline FSTATUS -iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size, - void *arg, IB_HANDLE *cq_handle, uint32 *actual_size) -{ - return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size, - arg, cq_handle, actual_size); -} - -static inline FSTATUS -iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc) -{ - return IIBT_IF.Vpi.PollCQ(cq_handle, wc); -} - -static inline FSTATUS -iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select) -{ - return IIBT_IF.Vpi.RearmCQ(cq_handle, select); -} - -static inline FSTATUS -iibt_cq_destroy(IB_HANDLE cq_handle) -{ - return IIBT_IF.Vpi.DestroyCQ(cq_handle); -} - -static inline FSTATUS -iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr, - void *arg, IB_HANDLE *cq_handle, - IB_QP_ATTRIBUTES_QUERY *query_attr) -{ - return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, - query_attr); -} - -static inline FSTATUS -iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr, - void **arg_ptr) -{ - return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr); -} - -static inline FSTATUS -iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr, - IB_QP_ATTRIBUTES_QUERY *query_attr) -{ - return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr); -} - -static inline FSTATUS -iibt_qp_destroy(IB_HANDLE qp_handle) -{ - return IIBT_IF.Vpi.DestroyQP(qp_handle); -} - -static inline FSTATUS -iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) -{ - return IIBT_IF.Vpi.PostRecv(qp_handle, work_req); -} - -static inline FSTATUS -iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) -{ - 
return IIBT_IF.Vpi.PostSend(qp_handle, work_req); -} - -static inline FSTATUS -iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p) -{ - return IIBT_IF.Sdi.Register(sd_handle, p); -} - -static inline FSTATUS -iibt_sd_deregister(IB_HANDLE sd_handle) -{ - return IIBT_IF.Sdi.Deregister(sd_handle); -} - -static inline FSTATUS -iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid, - FABRIC_OPERATION_DATA *fod, - PFABRIC_OPERATION_CALLBACK callback, - COMMAND_CONTROL_PARAMETERS *p, void *arg) -{ - return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid, - fod, callback, p, arg); -} - -static inline FSTATUS -iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid, - QUERY *qry, - PQUERY_CALLBACK callback, - COMMAND_CONTROL_PARAMETERS *p, void *arg) -{ - return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid, - qry, callback, p, arg); -} - -static inline IB_HANDLE -iibt_cm_create_cep(CM_CEP_TYPE type) -{ - return IIBT_IF.Cmi.CmCreateCEP(type); -} - -static inline FSTATUS -iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len, - uint32 offset) -{ - return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset); -} - -static inline FSTATUS -iibt_cm_destroy_cep(IB_HANDLE cep_handle) -{ - return IIBT_IF.Cmi.CmDestroyCEP(cep_handle); -} - -static inline FSTATUS -iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info, - PFN_CM_CALLBACK callback, void *arg) -{ - return IIBT_IF.Cmi.CmListen(cep, info, callback, arg); -} - -static inline FSTATUS -iibt_cm_cancel(IB_HANDLE cep) -{ - return IIBT_IF.Cmi.CmCancel(cep); -} - -static inline FSTATUS -iibt_cm_accept(IB_HANDLE cep, - CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info, - PFN_CM_CALLBACK callback, void *arg, - IB_HANDLE *new_cep) -{ - return IIBT_IF.Cmi.CmAccept(cep, - send_info, recv_info, - callback, arg, new_cep); -} - -static inline FSTATUS -iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej) -{ - return IIBT_IF.Cmi.CmReject(cep, rej); -} - -static 
inline FSTATUS -iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req, - CM_DREPLY_INFO *reply) -{ - return IIBT_IF.Cmi.CmDisconnect(cep, req, reply); -} - -static inline FSTATUS -iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req, - PFN_CM_CALLBACK callback, void *arg) -{ - return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg); -} - -static inline int wrq_signals_completion(IB_WORK_REQ *wrq) -{ - return wrq->Req.SendRC.Options.s.SignaledCompletion == 1; -} - - -/******************************************************************************/ - -/* these are purposely avoiding using local vars so they don't increase - * stack consumption. */ - -#define kib_peer_addref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - atomic_inc(&peer->ibp_refcount); \ -} while (0) - -#define kib_peer_decref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - if (atomic_dec_and_test (&peer->ibp_refcount)) { \ - CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ - peer->ibp_nid, peer); \ - kibnal_destroy_peer (peer); \ - } \ -} while (0) - -/******************************************************************************/ - -static inline struct list_head * -kibnal_nid2peerlist (ptl_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - - return (&kibnal_data.kib_peers [hash]); -} - -static inline int -kibnal_peer_active(kib_peer_t *peer) -{ - /* Am I in the peer hash table? 
*/ - return (!list_empty(&peer->ibp_list)); -} - -static inline void -kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) -{ - /* CAVEAT EMPTOR: tx takes caller's ref on conn */ - - LASSERT (tx->tx_nsp > 0); /* work items set up */ - LASSERT (tx->tx_conn == NULL); /* only set here */ - - tx->tx_conn = conn; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); -} - -#define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_1 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_2 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_3 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_4 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_5 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_6 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_7 | \ - IB_SERVICE_RECORD_COMP_SERVICEDATA8_8) - -static inline __u64* -kibnal_service_nid_field(IB_SERVICE_RECORD *srv) -{ - /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ - return (__u64 *)srv->ServiceData8; -} - - -static inline void -kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid) -{ - LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName)); - memset (srv->ServiceName, 0, sizeof(srv->ServiceName)); - strcpy (srv->ServiceName, IBNAL_SERVICE_NAME); - - *kibnal_service_nid_field(srv) = cpu_to_le64(nid); -} - -#if 0 -static inline void -kibnal_show_rdma_attr (kib_conn_t *conn) -{ - struct ib_qp_attribute qp_attr; - int rc; - - memset (&qp_attr, 0, sizeof(qp_attr)); - rc = ib_qp_query(conn->ibc_qp, &qp_attr); - if (rc != 0) { - CERROR ("Can't get qp attrs: %d\n", rc); - return; - } - - CWARN ("RDMA CAPABILITY: write %s read %s\n", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_read ? 
"enabled" : "disabled") : "invalid"); -} -#endif - -#if CONFIG_X86 -static inline __u64 -kibnal_page2phys (struct page *p) -{ - __u64 page_number = p - mem_map; - - return (page_number << PAGE_SHIFT); -} -#else -# error "no page->phys" -#endif - -/* CAVEAT EMPTOR: - * We rely on tx/rx descriptor alignment to allow us to use the lowest bit - * of the work request id as a flag to determine if the completion is for a - * transmit or a receive. It seems that that the CQ entry's 'op' field - * isn't always set correctly on completions that occur after QP teardown. */ - -static inline __u64 -kibnal_ptr2wreqid (void *ptr, int isrx) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT ((lptr & 1) == 0); - return (__u64)(lptr | (isrx ? 1 : 0)); -} - -static inline void * -kibnal_wreqid2ptr (__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~1UL); -} - -static inline int -kibnal_wreqid_is_rx (__u64 wreqid) -{ - return (wreqid & 1) != 0; -} - -static inline int -kibnal_whole_mem(void) -{ - return kibnal_data.kib_md.md_handle != NULL; -} - -extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); -extern void kibnal_destroy_peer (kib_peer_t *peer); -extern int kibnal_del_peer (ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); -extern void kibnal_unlink_peer_locked (kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_put_conn (kib_conn_t *conn); -extern void kibnal_destroy_conn (kib_conn_t *conn); -void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg); - -extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages (kib_pages_t *p); - -extern void kibnal_check_sends (kib_conn_t *conn); -extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_thread_start 
(int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd (void *arg); -extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -extern void kibnal_close_conn (kib_conn_t *conn, int why); -extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); - -void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev); -void kibnal_ca_callback (void *ca_arg, void *cq_arg); diff --git a/lustre/portals/knals/iibnal/iibnal_cb.c b/lustre/portals/knals/iibnal/iibnal_cb.c deleted file mode 100644 index a827ba5..0000000 --- a/lustre/portals/knals/iibnal/iibnal_cb.c +++ /dev/null @@ -1,3018 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include "iibnal.h" - -/* - * LIB functions follow - * - */ -static void -kibnal_schedule_tx_done (kib_tx_t *tx) -{ - unsigned long flags; - - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -static void -kibnal_tx_done (kib_tx_t *tx) -{ - ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; - unsigned long flags; - int i; - FSTATUS frc; - - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ - - switch (tx->tx_mapped) { - default: - LBUG(); - - case KIB_TX_UNMAPPED: - break; - - case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - frc = iibt_deregister_memory(tx->tx_md.md_handle); - LASSERT (frc == FSUCCESS); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; - -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... 
*/ - kibnal_schedule_tx_done(tx); - return; - } - - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); - - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; -#endif - } - - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; - - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } - - if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; - tx->tx_status = 0; - - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); -} - -static kib_tx_t * -kibnal_get_idle_tx (int may_block) -{ - unsigned long flags; - kib_tx_t *tx = NULL; - ENTRY; - - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } - - /* block for idle tx */ - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); - } - - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... 
*/ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - RETURN(tx); -} - -static int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -static void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) -{ - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) - continue; - - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); - - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); - - if (idle) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); - return; - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); -} - -static __u32 -kibnal_lkey(kib_pages_t *ibp) -{ - if (kibnal_whole_mem()) - return kibnal_data.kib_md.md_lkey; - - return ibp->ibp_lkey; -} - -static void -kibnal_post_rx (kib_rx_t *rx, int do_credits) -{ - kib_conn_t *conn = rx->rx_conn; - int rc = 0; - unsigned long flags; - FSTATUS frc; - ENTRY; - - rx->rx_gl = (IB_LOCAL_DATASEGMENT) { - .Address = rx->rx_vaddr, - .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(conn->ibc_rx_pages), - }; - - rx->rx_wrq = (IB_WORK_REQ) { - .Operation = WROpRecv, - .DSListDepth = 1, - .MessageLen = IBNAL_MSG_SIZE, - .WorkReqId = kibnal_ptr2wreqid(rx, 1), - .DSList = &rx->rx_gl, - }; - - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DREP); - LASSERT (!rx->rx_posted); - rx->rx_posted = 1; - mb(); - - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else { - frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); - if (frc != FSUCCESS) { - CDEBUG(D_NET, "post failed %d\n", frc); - rc = -EINVAL; - } - CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); - } - - if (rc == 0) { - if (do_credits) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_outstanding_credits++; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - EXIT; - return; - } - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - } - - /* Drop rx's ref */ - kibnal_put_conn (conn); - EXIT; -} - -#if IBNAL_CKSUM -static inline __u32 kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - return (sum); -} -#endif - -static void hexdump(char *string, void *ptr, int len) -{ - unsigned char *c = ptr; - int i; - - return; - - 
if (len < 0 || len > 2048) { - printk("XXX what the hell? %d\n",len); - return; - } - - printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); - - for (i = 0; i < len;) { - printk("%02x",*(c++)); - i++; - if (!(i & 15)) { - printk("\n"); - } else if (!(i&1)) { - printk(" "); - } - } - - if(len & 15) { - printk("\n"); - } -} - -static void -kibnal_rx_callback (IB_WORK_COMPLETION *wc) -{ - kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int nob = wc->Length; - const int base_nob = offsetof(kib_msg_t, ibm_u); - int credits; - int flipped; - unsigned long flags; - __u32 i; -#if IBNAL_CKSUM - __u32 msg_cksum; - __u32 computed_cksum; -#endif - - /* we set the QP to erroring after we've finished disconnecting, - * maybe we should do so sooner. */ - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DISCONNECTED); - - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_posted); - rx->rx_posted = 0; - mb(); - - /* receives complete with error in any case after we've started - * disconnecting */ - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto failed; - - if (wc->Status != WRStatusSuccess) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); - goto failed; - } - - if (nob < base_nob) { - CERROR ("Short rx from "LPX64": %d < expected %d\n", - conn->ibc_peer->ibp_nid, nob, base_nob); - goto failed; - } - - hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); - - /* Receiver does any byte flipping if necessary... 
*/ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flipped = 0; - } else { - if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->ibm_magic, conn->ibc_peer->ibp_nid); - goto failed; - } - flipped = 1; - __swab16s (&msg->ibm_version); - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); - } - - if (msg->ibm_version != IBNAL_MSG_VERSION) { - CERROR ("Incompatible msg version %d (%d expected)\n", - msg->ibm_version, IBNAL_MSG_VERSION); - goto failed; - } - -#if IBNAL_CKSUM - if (nob != msg->ibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); - goto failed; - } - - msg_cksum = le32_to_cpu(msg->ibm_cksum); - msg->ibm_cksum = 0; - computed_cksum = kibnal_cksum (msg, nob); - - if (msg_cksum != computed_cksum) { - CERROR ("Checksum failure %d: (%d expected)\n", - computed_cksum, msg_cksum); -// goto failed; - } - CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); -#endif - - /* Have I received credits that will let me send? 
*/ - credits = msg->ibm_credits; - if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); - return; - - case IBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (kib_immediate_msg_t)) { - CERROR ("Short IMMEDIATE from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (kib_rdma_msg_t)) { - CERROR ("Short RDMA msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32(msg->ibm_u.rdma.ibrm_num_descs); - - CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); - - if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || - (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > - min(nob, IBNAL_MSG_SIZE))) { - CERROR ("num_descs %d too large\n", - msg->ibm_u.rdma.ibrm_num_descs); - goto failed; - } - - for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; - - if (flipped) { - __swab32(desc->rd_key); - __swab32(desc->rd_nob); - __swab64(desc->rd_addr); - } - - CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", - desc->rd_key, desc->rd_addr, desc->rd_nob); - } - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (kib_completion_msg_t)) { - CERROR ("Short COMPLETION msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32s(&msg->ibm_u.completion.ibcm_status); - - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); - 
return; - - default: - CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->ibm_type); - goto failed; - } - - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); - - /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); -} - -void -kibnal_rx (kib_rx_t *rx) -{ - kib_msg_t *msg = rx->rx_msg; - - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; - - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; - - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. */ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); - break; - - case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... 
*/ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - break; - - case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); - break; - - default: - LBUG(); - break; - } - - kibnal_post_rx (rx, 1); -} - -static struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); - - if (!VALID_PAGE (page)) - page = NULL; - - return page; -} - -static void -kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, - unsigned long len, int active) -{ - kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; - kib_rdma_desc_t *desc; - - LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", - ibrm->ibrm_num_descs); - - desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; - if (active) - desc->rd_key = kibnal_data.kib_md.md_lkey; - else - desc->rd_key = kibnal_data.kib_md.md_rkey; - desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ - desc->rd_addr = kibnal_page2phys(page) + page_offset + - kibnal_data.kib_md.md_addr; - - ibrm->ibrm_num_descs++; -} - -static int -kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) -{ - struct page *page; - int page_offset, len; - - while (nob > 0) { - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) - return -EFAULT; - - page_offset = vaddr & (PAGE_SIZE - 1); - len = min(nob, (int)PAGE_SIZE - page_offset); - - kibnal_fill_ibrm(tx, page, page_offset, len, active); - nob -= len; - vaddr += len; - } - return 0; -} - -static int -kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int niov, struct iovec *iov, int offset, int nob, int active) - -{ - void *vaddr; - 
FSTATUS frc; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } - - /* our large contiguous iov could be backed by multiple physical - * pages. */ - if (kibnal_whole_mem()) { - int rc; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + - offset, nob, active); - if (rc != 0) { - CERROR ("Can't map iov: %d\n", rc); - return rc; - } - return 0; - } - - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - - frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, - kibnal_data.kib_pd, access, - &tx->tx_md.md_handle, &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - if (frc != 0) { - CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); - return -EINVAL; - } - - tx->tx_mapped = KIB_TX_MAPPED; - return (0); -} - -static int -kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int nkiov, ptl_kiov_t *kiov, - int offset, int nob, int active) -{ - __u64 *phys = NULL; - int page_offset; - int nphys; - int resid; - int phys_size = 0; - FSTATUS frc; - int i, rc = 0; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - page_offset = kiov->kiov_offset + offset; - nphys = 1; - - if (!kibnal_whole_mem()) { - phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - - phys[0] = kibnal_page2phys(kiov->kiov_page); - } else { - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - kibnal_fill_ibrm(tx, 
kiov->kiov_page, kiov->kiov_offset, - kiov->kiov_len, active); - } - - resid = nob - (kiov->kiov_len - offset); - - while (resid > 0) { - kiov++; - nkiov--; - LASSERT (nkiov > 0); - - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - /* Can't have gaps */ - CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - { - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } - - rc = -EINVAL; - goto out; - } - - if (nphys == PTL_MD_MAX_IOV) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - if (!kibnal_whole_mem()) { - LASSERT (nphys * sizeof (*phys) < phys_size); - phys[nphys] = kibnal_page2phys(kiov->kiov_page); - } else { - if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - kibnal_fill_ibrm(tx, kiov->kiov_page, - kiov->kiov_offset, kiov->kiov_len, - active); - } - - nphys ++; - resid -= PAGE_SIZE; - } - - if (kibnal_whole_mem()) - goto out; - -#if 0 - CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); - for (i = 0; i < nphys; i++) - CWARN (" [%d] "LPX64"\n", i, phys[i]); -#endif - -#if IBNAL_FMR -#error "iibnal hasn't learned about FMR yet" - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys, nphys, - &tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - IBNAL_RDMA_BASE, - phys, nphys, - 0, /* offset */ - kibnal_data.kib_pd, - access, - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (frc == FSUCCESS) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); -#if 
IBNAL_FMR - tx->tx_mapped = KIB_TX_MAPPED_FMR; -#else - tx->tx_mapped = KIB_TX_MAPPED; -#endif - } else { - CERROR ("Can't map phys: %d\n", rc); - rc = -EFAULT; - } - - out: - if (phys != NULL) - PORTAL_FREE(phys, phys_size); - return (rc); -} - -static kib_conn_t * -kibnal_find_conn_locked (kib_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, kib_conn_t, ibc_list)); - } - - return (NULL); -} - -void -kibnal_check_sends (kib_conn_t *conn) -{ - unsigned long flags; - kib_tx_t *tx; - int rc; - int i; - int done; - int nwork; - ENTRY; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); - - if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = kibnal_get_idle_tx(0); /* don't block */ - if (tx != NULL) - kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); - kibnal_queue_tx_locked(tx, conn); - } - } - - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); - - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); - - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - - /* Not on ibc_rdma_queue */ - LASSERT (!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) - GOTO(out, 0); - - if (conn->ibc_credits == 0) /* no credits */ - GOTO(out, 1); - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - GOTO(out, 2); - - list_del (&tx->tx_list); - - if 
(tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { - /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); - continue; - } - - tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; - conn->ibc_outstanding_credits = 0; - - conn->ibc_nsends_posted++; - conn->ibc_credits--; - - /* we only get a tx completion for the final rdma op */ - tx->tx_sending = min(tx->tx_nsp, 2); - tx->tx_passive_rdma_wait = tx->tx_passive_rdma; - list_add (&tx->tx_list, &conn->ibc_active_txs); -#if IBNAL_CKSUM - tx->tx_msg->ibm_cksum = 0; - tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); -#endif - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* NB the gap between removing tx from the queue and sending it - * allows message re-ordering to occur */ - - LASSERT (tx->tx_nsp > 0); - - rc = -ECONNABORTED; - nwork = 0; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - tx->tx_status = 0; - /* Driver only accepts 1 item at a time */ - for (i = 0; i < tx->tx_nsp; i++) { - hexdump("tx", tx->tx_msg, sizeof(kib_msg_t)); - rc = iibt_postsend(conn->ibc_qp, - &tx->tx_wrq[i]); - if (rc != 0) - break; - if (wrq_signals_completion(&tx->tx_wrq[i])) - nwork++; - CDEBUG(D_NET, "posted tx wrq %p\n", - &tx->tx_wrq[i]); - } - } - - spin_lock_irqsave (&conn->ibc_lock, flags); - if (rc != 0) { - /* NB credits are transferred in the actual - * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - conn->ibc_credits++; - conn->ibc_nsends_posted--; - - tx->tx_status = rc; - tx->tx_passive_rdma_wait = 0; - tx->tx_sending -= tx->tx_nsp - nwork; - - done = (tx->tx_sending == 0); - if (done) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if 
(conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); - else - CDEBUG (D_NET, "Error %d posting transmit to " - LPX64"\n", rc, conn->ibc_peer->ibp_nid); - - kibnal_close_conn (conn, rc); - - if (done) - kibnal_tx_done (tx); - return; - } - - } - - EXIT; -out: - spin_unlock_irqrestore (&conn->ibc_lock, flags); -} - -static void -kibnal_tx_callback (IB_WORK_COMPLETION *wc) -{ - kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); - kib_conn_t *conn; - unsigned long flags; - int idle; - - conn = tx->tx_conn; - LASSERT (conn != NULL); - LASSERT (tx->tx_sending != 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, - tx->tx_sending, tx->tx_nsp, wc->Status); - - /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. If it's - * not me, then I take an extra ref on conn so it can't disappear - * under me. */ - - tx->tx_sending--; - idle = (tx->tx_sending == 0) && /* This is the final callback */ - (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ - if (idle) - list_del(&tx->tx_list); - - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - - if (tx->tx_sending == 0) - conn->ibc_nsends_posted--; - - if (wc->Status != WRStatusSuccess && - tx->tx_status == 0) - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - if (idle) - kibnal_tx_done (tx); - - if (wc->Status != WRStatusSuccess) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); - kibnal_close_conn (conn, -ENETDOWN); - } else { - /* can I shovel some more sends out the door? 
*/ - kibnal_check_sends(conn); - } - - kibnal_put_conn (conn); -} - -void -kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) -{ - /* XXX flesh out. this seems largely for async errors */ - CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); -} - -void -kibnal_ca_callback (void *ca_arg, void *cq_arg) -{ - IB_HANDLE cq = *(IB_HANDLE *)cq_arg; - IB_HANDLE ca = *(IB_HANDLE *)ca_arg; - IB_WORK_COMPLETION wc; - int armed = 0; - - CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); - - for(;;) { - while (iibt_cq_poll(cq, &wc) == FSUCCESS) { - if (kibnal_wreqid_is_rx(wc.WorkReqId)) - kibnal_rx_callback(&wc); - else - kibnal_tx_callback(&wc); - } - if (armed) - return; - if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { - CERROR("rearm failed?\n"); - return; - } - armed = 1; - } -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; - IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); - LASSERT (nob <= IBNAL_MSG_SIZE); - - tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; - tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; - tx->tx_msg->ibm_type = type; -#if IBNAL_CKSUM - tx->tx_msg->ibm_nob = nob; -#endif - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); - - *gl = (IB_LOCAL_DATASEGMENT) { - .Address = tx->tx_vaddr, - .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), - }; - - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); - wrq->Operation = WROpSend; - wrq->DSList = gl; - wrq->DSListDepth = 1; - wrq->MessageLen = nob; - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 1; - wrq->Req.SendRC.Options.s.SignaledCompletion = 1; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = fence; - - tx->tx_nsp++; -} - 
-static void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) -{ - unsigned long flags; - - spin_lock_irqsave(&conn->ibc_lock, flags); - - kibnal_queue_tx_locked (tx, conn); - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); -} - -static void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_conn_t *conn; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nsp > 0); /* work items have been set up */ - - read_lock (g_lock); - - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock (g_lock); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - read_unlock (g_lock); - - kibnal_queue_tx (tx, conn); - return; - } - - /* Making one or more connections; I'll need a write lock... 
*/ - read_unlock (g_lock); - write_lock_irqsave (g_lock, flags); - - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - write_unlock_irqrestore (g_lock, flags); - - kibnal_queue_tx (tx, conn); - return; - } - - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - peer->ibp_connecting = 1; - kib_peer_addref(peer); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); - } - - /* A connection is being established; queue the message... 
*/ - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - - write_unlock_irqrestore (g_lock, flags); -} - -static ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) -{ - int nob = libmsg->md->length; - kib_tx_t *tx; - kib_msg_t *ibmsg; - int rc; - IB_ACCESS_CONTROL access = {0,}; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ - - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; - - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); - - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob, 0); - else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob, 0); - - if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); - rc = -ENOMEM; - goto failed; - } - } - - tx->tx_passive_rdma = 1; - - ibmsg = tx->tx_msg; - - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - /* map_kiov alrady filled the rdma descs for the whole_mem case */ - if (!kibnal_whole_mem()) { - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - ibmsg->ibm_u.rdma.ibrm_num_descs = 1; - } - - kibnal_init_tx_msg (tx, type, - kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); - - CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " - LPX64", nob %d\n", - tx, tx->tx_passive_rdma_cookie, 
tx->tx_md.md_rkey, - tx->tx_md.md_addr, nob); - - /* libmsg gets finalized when tx completes. */ - tx->tx_libmsg[0] = libmsg; - - kibnal_launch_tx(tx, nid); - return (PTL_OK); - - failed: - tx->tx_status = rc; - kibnal_tx_done (tx); - return (PTL_FAIL); -} - -void -kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob) -{ - kib_msg_t *rxmsg = rx->rx_msg; - kib_msg_t *txmsg; - kib_tx_t *tx; - IB_ACCESS_CONTROL access = {0,}; - IB_WR_OP rdma_op; - int rc; - __u32 i; - - CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", - type, status, niov, offset, nob); - - /* Called by scheduler */ - LASSERT (!in_interrupt ()); - - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - /* No data if we're completing with failure */ - LASSERT (status == 0 || nob == 0); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. 
*/ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; - - if (type == IBNAL_MSG_GET_DONE) { - rdma_op = WROpRdmaWrite; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access.s.LocalWrite = 1; - rdma_op = WROpRdmaRead; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); - } - - tx = kibnal_get_idle_tx (0); /* Mustn't block */ - if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 - " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); - return; - } - LASSERT (tx->tx_nsp == 0); - - if (nob == 0) - GOTO(init_tx, 0); - - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); - else - rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); - - if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - if (!kibnal_whole_mem()) { - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; - } - - /* XXX ugh. different page-sized hosts. */ - if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != - rxmsg->ibm_u.rdma.ibrm_num_descs) { - CERROR("tx descs (%u) != rx descs (%u)\n", - tx->tx_msg->ibm_u.rdma.ibrm_num_descs, - rxmsg->ibm_u.rdma.ibrm_num_descs); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - /* map_kiov filled in the rdma descs which describe our side of the - * rdma transfer. 
*/ - /* ibrm_num_descs was verified in rx_callback */ - for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ - IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; - IB_WORK_REQ *wrq = &tx->tx_wrq[i]; - - ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; - rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; - - ds->Address = ldesc->rd_addr; - ds->Length = ldesc->rd_nob; - ds->Lkey = ldesc->rd_key; - - memset(wrq, 0, sizeof(*wrq)); - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); - wrq->Operation = rdma_op; - wrq->DSList = ds; - wrq->DSListDepth = 1; - wrq->MessageLen = ds->Length; - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 0; - wrq->Req.SendRC.Options.s.SignaledCompletion = 0; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; - wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key; - - /* only the last rdma post triggers tx completion */ - if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) - wrq->Req.SendRC.Options.s.SignaledCompletion = 1; - - tx->tx_nsp++; - } - -init_tx: - txmsg = tx->tx_msg; - - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; - - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; - } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); - } - - /* +1 ref for this tx... 
*/ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - rx->rx_conn, rx->rx_conn->ibc_state, - rx->rx_conn->ibc_peer->ibp_nid, - atomic_read (&rx->rx_conn->ibc_refcount)); - atomic_inc (&rx->rx_conn->ibc_refcount); - /* ...and queue it up */ - kibnal_queue_tx(tx, rx->rx_conn); -} - -static ptl_err_t -kibnal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", payload_nob, payload_niov, nid , pid); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* Thread context if we're sending payload */ - LASSERT (!in_interrupt() || payload_niov == 0); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - switch (type) { - default: - LBUG(); - return (PTL_FAIL); - - case PTL_MSG_REPLY: { - /* reply's 'private' is the incoming receive */ - kib_rx_t *rx = private; - - /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - return (PTL_OK); - } - - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); - } - - /* Will it fit in a message? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob >= IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", - nid, payload_nob); - return (PTL_FAIL); - } - break; - } - - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; - - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); - - break; - } - - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); - if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? 
" (intr)" : ""); - return (PTL_NO_SPACE); - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } - - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); - - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; - - kibnal_launch_tx(tx, nid); - return (PTL_OK); -} - -static ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} - -static ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); -} - -static ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch (rxmsg->ibm_type) { - default: - LBUG(); - return (PTL_FAIL); - - case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, 
ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); - } - - if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); - - case IBNAL_MSG_GET_RDMA: - /* We get called here just to discard any junk after the - * GET hdr. */ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); - - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); - return (PTL_OK); - } -} - -static ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); -} - -static ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); -} - -/***************************************************************************** - * the rest of this file concerns connection management. active connetions - * start with connect_peer, passive connections start with passive_callback. - * active disconnects start with conn_close, cm_callback starts passive - * disconnects and contains the guts of how the disconnect state machine - * progresses. 
- *****************************************************************************/ - -int -kibnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc (&kibnal_data.kib_nthreads); - return (0); -} - -static void -kibnal_thread_fini (void) -{ - atomic_dec (&kibnal_data.kib_nthreads); -} - -/* this can be called by anyone at any time to close a connection. if - * the connection is still established it heads to the connd to start - * the disconnection in a safe context. It has no effect if called - * on a connection that is already disconnecting */ -void -kibnal_close_conn_locked (kib_conn_t *conn, int error) -{ - /* This just does the immmediate housekeeping, and schedules the - * connection for the connd to finish off. - * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_t *peer = conn->ibc_peer; - - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, - IBNAL_CONN_DISCONNECTED); - - if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - return; /* already disconnecting */ - - CDEBUG (error == 0 ? D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_connd_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_connd_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - } - - if (list_empty (&peer->ibp_conns) && - peer->ibp_persistence == 0) { - /* Non-persistent peer with no more conns... 
*/ - kibnal_unlink_peer_locked (peer); - } - - conn->ibc_state = IBNAL_CONN_SEND_DREQ; - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); -} - -void -kibnal_close_conn (kib_conn_t *conn, int error) -{ - unsigned long flags; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - kibnal_close_conn_locked (conn, error); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); -} - -static void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) -{ - LIST_HEAD (zombies); - kib_tx_t *tx; - unsigned long flags; - - LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; - - if (peer->ibp_connecting != 0) { - /* another connection attempt under way (loopback?)... 
*/ - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return; - } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; - /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); - - /* Take peer's blocked blocked transmits; I'll complete - * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); - - while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - } -} - -static void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) -{ - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int i; - - /* passive connection has no connreq & vice versa */ - LASSERTF(!active == !(conn->ibc_connreq != NULL), - "%d %p\n", active, conn->ibc_connreq); - if (active) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting != 0); - - if (status == 0) { - /* connection established... 
*/ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_ESTABLISHED; - - if (!kibnal_peer_active(peer)) { - /* ...but peer deleted meantime */ - status = -ECONNABORTED; - } - } else { - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, - IBNAL_CONN_CONNECTING); - } - - if (status == 0) { - /* Everything worked! */ - - peer->ibp_connecting--; - - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - kibnal_queue_tx_locked (tx, conn); - } - - spin_unlock (&conn->ibc_lock); - - /* Nuke any dangling conns from a different peer instance... 
*/ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - - CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, - conn->ibc_rxs[i].rx_vaddr); - - kibnal_post_rx (&conn->ibc_rxs[i], 0); - } - - kibnal_check_sends (conn); - return; - } - - /* connection failed */ - if (state == IBNAL_CONN_CONNECTING) { - /* schedule for connd to close */ - kibnal_close_conn_locked (conn, status); - } else { - /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_connect_failed (conn->ibc_peer, active, status); - - /* If we didn't establish the connection we don't have to pass - * through the disconnect protocol before dropping the CM ref */ - if (state < IBNAL_CONN_CONNECTING) - kibnal_put_conn (conn); -} - -static int -kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, - ptl_nid_t nid, __u64 incarnation, int queue_depth) -{ - kib_conn_t *conn = kibnal_create_conn(); - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - - if (conn == NULL) - return (-ENOMEM); - - if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-EPROTO); - } - - /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (nid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read 
(&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-ENOMEM); - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - peer2 = kibnal_find_peer_locked(nid); - if (peer2 == NULL) { - /* peer table takes my ref on peer */ - list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); - } else { - kib_peer_decref (peer); - peer = peer2; - } - - kib_peer_addref(peer); /* +1 ref for conn */ - peer->ibp_connecting++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - /* conn->ibc_cep is set when cm_accept is called */ - conn->ibc_incarnation = incarnation; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - *connp = conn; - return (0); -} - -static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; - FSTATUS frc; - - modify_attr.RequestState = state; - - frc = iibt_qp_modify(qp, &modify_attr, NULL); - if (frc != FSUCCESS) - CERROR("couldn't set qp state to %d, error %d\n", state, frc); -} - -static void kibnal_flush_pending(kib_conn_t *conn) -{ - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - int done; - - /* NB we wait until the connection has closed before completing - * outstanding passive RDMAs so we can be sure the network can't - * touch the mapped memory any more. 
*/ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); - - /* set the QP to the error state so that we get flush callbacks - * on our posted receives which can then drop their conn refs */ - kibnal_set_qp_state(conn->ibc_qp, QPStateError); - - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - if (!done) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del(&tx->tx_list); - kibnal_tx_done (tx); - } -} - -static void -kibnal_reject (IB_HANDLE cep, uint16_t reason) -{ - CM_REJECT_INFO *rej; - - PORTAL_ALLOC(rej, sizeof(*rej)); - if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ - return; - - rej->Reason = reason; - iibt_cm_reject(cep, rej); - PORTAL_FREE(rej, sizeof(*rej)); -} - -static FSTATUS -kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, - IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr; - FSTATUS frc; - ENTRY; - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToRecv, - .RecvPSN = IBNAL_STARTING_PSN, - .DestQPNumber = qpn, - .ResponderResources = resp_res, - .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ - 
.Attrs = (IB_QP_ATTR_RECVPSN | - IB_QP_ATTR_DESTQPNUMBER | - IB_QP_ATTR_RESPONDERRESOURCES | - IB_QP_ATTR_DESTAV | - IB_QP_ATTR_PATHMTU | - IB_QP_ATTR_MINRNRTIMER), - }; - GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, - &modify_attr.DestAV); - - frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); - if (frc != FSUCCESS) - RETURN(frc); - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToSend, - .FlowControl = TRUE, - .InitiatorDepth = init_depth, - .SendPSN = send_psn, - .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */ - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .Attrs = (IB_QP_ATTR_FLOWCONTROL | - IB_QP_ATTR_INITIATORDEPTH | - IB_QP_ATTR_SENDPSN | - IB_QP_ATTR_LOCALACKTIMEOUT | - IB_QP_ATTR_RETRYCOUNT | - IB_QP_ATTR_RNRRETRYCOUNT), - }; - - frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); - RETURN(frc); -} - -static void -kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - kib_wire_connreq_t *wcr; - CM_REPLY_INFO *rep = &info->Info.Reply; - uint16_t reason; - FSTATUS frc; - - wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData; - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't connect "LPX64": bad magic %08x\n", - conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = RC_USER_REJ); - } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't connect "LPX64": bad version %d\n", - conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = RC_USER_REJ); - } - - if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { - CERROR ("Can't connect "LPX64": bad queue depth %d\n", - conn->ibc_peer->ibp_nid, - le16_to_cpu(wcr->wcr_queue_depth)); - GOTO(reject, reason = RC_USER_REJ); - } - - if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { - CERROR ("Unexpected NID "LPX64" from "LPX64"\n", - 
le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - GOTO(reject, reason = RC_USER_REJ); - } - - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, - min_t(__u8, rep->ArbInitiatorDepth, - ca_attr->MaxQPResponderResources), - &conn->ibc_connreq->cr_path, - min_t(__u8, rep->ArbResponderResources, - ca_attr->MaxQPInitiatorDepth), - rep->StartingPSN); - if (frc != FSUCCESS) { - CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - GOTO(reject, reason = RC_NO_QP); - } - - /* the callback arguments are ignored for an active accept */ - conn->ibc_connreq->cr_discarded.Status = FSUCCESS; - frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, - NULL, NULL, NULL, NULL); - if (frc != FCM_CONNECT_ESTABLISHED) { - CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - kibnal_connreq_done (conn, 1, -ECONNABORTED); - /* XXX don't call reject after accept fails? */ - return; - } - - CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 1, 0); - return; - -reject: - kibnal_reject(cep, reason); - kibnal_connreq_done (conn, 1, -EPROTO); -} - -/* ib_cm.h has a wealth of information on the CM procedures */ -static void -kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - kib_conn_t *conn = arg; - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - /* Established Connection Notifier */ - switch (info->Status) { - default: - CERROR("unknown status %d on Connection %p -> "LPX64"\n", - info->Status, conn, conn->ibc_peer->ibp_nid); - LBUG(); - break; - - case FCM_CONNECT_REPLY: - kibnal_connect_reply(cep, info, arg); - break; - - case FCM_DISCONNECT_REQUEST: - /* XXX lock around these state management bits? 
*/ - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - kibnal_close_conn (conn, 0); - conn->ibc_state = IBNAL_CONN_DREP; - iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); - break; - - /* these both guarantee that no more cm callbacks will occur */ - case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ - case FCM_DISCONNECT_REPLY: - CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - kibnal_flush_pending(conn); - kibnal_put_conn(conn); /* Lose CM's ref */ - break; - } - - return; -} - -static int -kibnal_set_cm_flags(IB_HANDLE cep) -{ - FSTATUS frc; - uint32 value = 1; - - frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, - (char *)&value, sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting timeout callback: %d\n", frc); - return -1; - } - -#if 0 - frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, - sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting async accept: %d\n", frc); - return -1; - } -#endif - - return 0; -} - -void -kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - IB_QP_ATTRIBUTES_QUERY *query; - CM_REQUEST_INFO *req; - CM_CONN_INFO *rep = NULL, *rcv = NULL; - kib_wire_connreq_t *wcr; - kib_conn_t *conn = NULL; - uint16_t reason = 0; - FSTATUS frc; - int rc = 0; - - LASSERT(cep); - LASSERT(info); - LASSERT(arg == NULL); /* no conn yet for passive */ - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - req = &info->Info.Request; - wcr = (kib_wire_connreq_t *)req->PrivateData; - - CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, - le64_to_cpu(wcr->wcr_nid)); - - if (info->Status == FCM_CONNECT_CANCEL) - return; - - LASSERT (info->Status == FCM_CONNECT_REQUEST); - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't accept: bad magic %08x\n", - le32_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); 
- } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't accept: bad version %d\n", - le16_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); - } - - rc = kibnal_accept(&conn, cep, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); - if (rc != 0) { - CERROR ("Can't accept "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), rc); - GOTO(out, reason = RC_NO_RESOURCES); - } - - frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, - min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, - ca_attr->MaxQPResponderResources), - &req->PathInfo.Path, - min_t(__u8, req->CEPInfo.OfferedResponderResources, - ca_attr->MaxQPInitiatorDepth), - req->CEPInfo.StartingPSN); - - if (frc != FSUCCESS) { - CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - - frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Couldn't query qp attributes "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - query = &conn->ibc_qp_attrs; - - PORTAL_ALLOC(rep, sizeof(*rep)); - PORTAL_ALLOC(rcv, sizeof(*rcv)); - if (rep == NULL || rcv == NULL) { - CERROR ("can't reply and receive buffers\n"); - GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); - } - - /* don't try to deref this into the incoming wcr :) */ - wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; - - rep->Info.Reply = (CM_REPLY_INFO) { - .QPN = query->QPNumber, - .QKey = query->Qkey, - .StartingPSN = query->RecvPSN, - .EndToEndFlowControl = query->FlowControl, - /* XXX Hmm. 
*/ - .ArbInitiatorDepth = query->InitiatorDepth, - .ArbResponderResources = query->ResponderResources, - .TargetAckDelay = 0, - .FailoverAccepted = 0, - .RnRRetryCount = req->CEPInfo.RnrRetryCount, - }; - - *wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, - &conn->ibc_cep); - - PORTAL_FREE(rep, sizeof(*rep)); - PORTAL_FREE(rcv, sizeof(*rcv)); - - if (frc != FCM_CONNECT_ESTABLISHED) { - /* XXX it seems we don't call reject after this point? */ - CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); - rc = -ECONNABORTED; - goto out; - } - - if (kibnal_set_cm_flags(conn->ibc_cep)) { - rc = -ECONNABORTED; - goto out; - } - - CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", - conn, conn->ibc_peer->ibp_nid); - -out: - if (reason) { - kibnal_reject(cep, reason); - rc = -ECONNABORTED; - } - if (conn != NULL) - kibnal_connreq_done(conn, 0, rc); - - return; -} - -static void -dump_path_records(PATH_RESULTS *results) -{ - IB_PATH_RECORD *path; - int i; - - for(i = 0; i < results->NumPathRecords; i++) { - path = &results->PathRecords[i]; - CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " - LPX64":"LPX64" pkey %x\n", - i, - path->SGID.Type.Global.SubnetPrefix, - path->SGID.Type.Global.InterfaceID, - path->DGID.Type.Global.SubnetPrefix, - path->DGID.Type.Global.InterfaceID, - path->P_Key); - } -} - -static void -kibnal_pathreq_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - PATH_RESULTS *path; - FSTATUS frc; - - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - 
query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - path = (PATH_RESULTS *)query_res->QueryResult; - - if (path->NumPathRecords < 1) { - CERROR ("expected path records: %d\n", path->NumPathRecords); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - dump_path_records(path); - - /* just using the first. this is probably a horrible idea. */ - conn->ibc_connreq->cr_path = path->PathRecords[0]; - - conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); - if (conn->ibc_cep == NULL) { - CERROR ("Can't create CEP\n"); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - if (kibnal_set_cm_flags(conn->ibc_cep)) { - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { - .SID = conn->ibc_connreq->cr_service.RID.ServiceID, - .CEPInfo = (CM_CEP_INFO) { - .CaGUID = kibnal_data.kib_hca_guids[0], - .EndToEndFlowControl = FALSE, - .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .AckTimeout = IBNAL_ACK_TIMEOUT, - .StartingPSN = IBNAL_STARTING_PSN, - .QPN = conn->ibc_qp_attrs.QPNumber, - .QKey = conn->ibc_qp_attrs.Qkey, - .OfferedResponderResources = ca_attr->MaxQPResponderResources, - .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, - }, - .PathInfo = (CM_CEP_PATHINFO) { - .bSubnetLocal = TRUE, - .Path = conn->ibc_connreq->cr_path, - }, - }; - -#if 0 - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; -#endif - /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = IBNAL_CONN_CONNECTING; - - CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", - conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - - memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, - CM_REQUEST_INFO_USER_LEN); - memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, - &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); - - /* kibnal_cm_callback gets my conn ref */ - frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, - kibnal_cm_callback, conn); - if (frc != FPENDING && frc != FSUCCESS) { - CERROR ("Connect: %d\n", frc); - /* Back out state change as connect failed */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done (conn, 1, -EINVAL); - } -} - -static void -dump_service_records(SERVICE_RECORD_RESULTS *results) -{ - IB_SERVICE_RECORD *svc; - int i; - - for(i = 0; i < results->NumServiceRecords; i++) { - svc = &results->ServiceRecords[i]; - CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", - i, - svc->RID.ServiceID, - svc->RID.ServiceGID.Type.Global.SubnetPrefix, - svc->RID.ServiceGID.Type.Global.InterfaceID, - svc->RID.ServiceP_Key); - } -} - - -static void -kibnal_service_get_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) -{ - kib_conn_t *conn = arg; - SERVICE_RECORD_RESULTS *svc; - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY path_query; - FSTATUS frc; - - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; - - if (svc->NumServiceRecords < 1) { - CERROR ("%d service records\n", svc->NumServiceRecords); - kibnal_connreq_done (conn, 1, -EINVAL); - return; - } - - dump_service_records(svc); - - conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; - - CDEBUG(D_NET, "Got status %d, 
service id "LPX64", on "LPX64"\n", - query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - - memset(&path_query, 0, sizeof(path_query)); - path_query.InputType = InputTypePortGuidPair; - path_query.OutputType = OutputTypePathRecord; - path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; - path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; - - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ - - /* kibnal_service_get_callback gets my conn ref */ - - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &path_query, - kibnal_pathreq_callback, - &sd_params, conn); - if (frc == FPENDING) - return; - - CERROR ("Path record request failed: %d\n", frc); - kibnal_connreq_done (conn, 1, -EINVAL); -} - -static void -kibnal_connect_peer (kib_peer_t *peer) -{ - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY query; - FSTATUS frc; - kib_conn_t *conn = kibnal_create_conn(); - - LASSERT (peer->ibp_connecting != 0); - - if (conn == NULL) { - CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); - return; - } - - conn->ibc_peer = peer; - kib_peer_addref(peer); - - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done (conn, 1, -ENOMEM); - return; - } - - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - - kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); - - memset(&query, 0, sizeof(query)); - query.InputType = InputTypeServiceRecord; - query.OutputType = OutputTypeServiceRecord; - query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; - query.InputValue.ServiceRecordValue.ComponentMask = 
KIBNAL_SERVICE_KEY_MASK; - - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ - - /* kibnal_service_get_callback gets my conn ref */ - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &query, - kibnal_service_get_callback, - &sd_params, conn); - if (frc == FPENDING) - return; - - CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); - kibnal_connreq_done (conn, 1, frc); -} - -static int -kibnal_conn_timed_out (kib_conn_t *conn) -{ - kib_tx_t *tx; - struct list_head *ttmp; - unsigned long flags; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_tx_queue) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; - } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; - } - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - return 0; -} - -static void -kibnal_check_conns (int idx) -{ - struct list_head *peers = &kibnal_data.kib_peers[idx]; - struct list_head *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - struct list_head *ctmp; - - again: - /* NB. We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we - * take a look... 
*/ - read_lock (&kibnal_data.kib_global_lock); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - - list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); - - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kibnal_check_sends(conn); - - if (!kibnal_conn_timed_out(conn)) - continue; - - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - atomic_inc (&conn->ibc_refcount); - read_unlock (&kibnal_data.kib_global_lock); - - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); - - kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); - - /* start again now I've dropped the lock */ - goto again; - } - } - - read_unlock (&kibnal_data.kib_global_lock); -} - -static void -kib_connd_handle_state(kib_conn_t *conn) -{ - FSTATUS frc; - - switch (conn->ibc_state) { - /* all refs have gone, free and be done with it */ - case IBNAL_CONN_DISCONNECTED: - kibnal_destroy_conn (conn); - return; /* avoid put_conn */ - - case IBNAL_CONN_SEND_DREQ: - frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); - if (frc != FSUCCESS) /* XXX do real things */ - CERROR("disconnect failed: %d\n", frc); - conn->ibc_state = IBNAL_CONN_DREQ; - break; - - /* a callback got to the conn before we did */ - case IBNAL_CONN_DREP: - break; - - default: - CERROR ("Bad conn %p state: %d\n", conn, - conn->ibc_state); - LBUG(); - break; - } - - /* drop ref from close_conn */ - kibnal_put_conn(conn); -} - -int -kibnal_connd (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kib_conn_t *conn; - kib_peer_t *peer; - int timeout; - int i; - int peer_index = 0; - unsigned long deadline = jiffies; - - kportal_daemonize ("kibnal_connd"); - kportal_blockallsigs (); - - init_waitqueue_entry (&wait, 
current); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - for (;;) { - if (!list_empty (&kibnal_data.kib_connd_conns)) { - conn = list_entry (kibnal_data.kib_connd_conns.next, - kib_conn_t, ibc_list); - list_del (&conn->ibc_list); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - kib_connd_handle_state(conn); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - continue; - } - - if (!list_empty (&kibnal_data.kib_connd_peers)) { - peer = list_entry (kibnal_data.kib_connd_peers.next, - kib_peer_t, ibp_connd_list); - - list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_connect_peer (peer); - kib_peer_decref (peer); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - /* shut down and nobody left to reap... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - /* careful with the jiffy wrap... */ - while ((timeout = (int)(deadline - jiffies)) <= 0) { - const int n = 4; - const int p = 1; - int chunk = kibnal_data.kib_peer_hash_size; - - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. 
*/ - - if (kibnal_tunables.kib_io_timeout > n * p) - chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); - peer_index = (peer_index + 1) % - kibnal_data.kib_peer_hash_size; - } - - deadline += p * HZ; - } - - kibnal_data.kib_connd_waketime = jiffies + timeout; - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - if (!kibnal_data.kib_shutdown && - list_empty (&kibnal_data.kib_connd_conns) && - list_empty (&kibnal_data.kib_connd_peers)) - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_thread_fini (); - return (0); -} - -int -kibnal_scheduler(void *arg) -{ - long id = (long)arg; - char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; - - snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - for (;;) { - did_something = 0; - - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - kibnal_tx_done(tx); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - kibnal_rx(rx); - - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - /* shut down and no receives to complete... 
*/ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - (kibnal_data.kib_shutdown && - atomic_read (&kibnal_data.kib_nconns) == 0)); - } else { - our_cond_resched(); - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - } - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - - kibnal_thread_fini(); - return (0); -} - - -lib_nal_t kibnal_lib = { - libnal_data: &kibnal_data, /* NAL private data */ - libnal_send: kibnal_send, - libnal_send_pages: kibnal_send_pages, - libnal_recv: kibnal_recv, - libnal_recv_pages: kibnal_recv_pages, - libnal_dist: kibnal_dist -}; diff --git a/lustre/portals/knals/lonal/.cvsignore b/lustre/portals/knals/lonal/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/knals/lonal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/lonal/Makefile.in b/lustre/portals/knals/lonal/Makefile.in deleted file mode 100644 index 222e861..0000000 --- a/lustre/portals/knals/lonal/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := klonal -klonal-objs := lonal.o lonal_cb.o - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/lonal/autoMakefile.am b/lustre/portals/knals/lonal/autoMakefile.am deleted file mode 100644 index d1ef995..0000000 --- a/lustre/portals/knals/lonal/autoMakefile.am +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -if MODULES -modulenet_DATA = klonal$(KMODEXT) -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(klonal-objs:%.o=%.c) lonal.h diff --git a/lustre/portals/knals/lonal/lonal.c b/lustre/portals/knals/lonal/lonal.c deleted file mode 100644 index 03c2742..0000000 --- a/lustre/portals/knals/lonal/lonal.c +++ /dev/null @@ -1,164 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "lonal.h" - -nal_t klonal_api; -klonal_data_t klonal_data; -ptl_handle_ni_t klonal_ni; - - -int -klonal_cmd (struct portals_cfg *pcfg, void *private) -{ - LASSERT (pcfg != NULL); - - switch (pcfg->pcfg_command) { - case NAL_CMD_REGISTER_MYNID: - CDEBUG (D_IOCTL, "setting NID to "LPX64" (was "LPX64")\n", - pcfg->pcfg_nid, klonal_lib.libnal_ni.ni_pid.nid); - klonal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; - return (0); - - default: - return (-EINVAL); - } -} - -static void -klonal_shutdown(nal_t *nal) -{ - /* NB The first ref was this module! 
*/ - if (nal->nal_refct != 0) - return; - - CDEBUG (D_NET, "shutdown\n"); - LASSERT (nal == &klonal_api); - - switch (klonal_data.klo_init) - { - default: - LASSERT (0); - - case KLO_INIT_ALL: - libcfs_nal_cmd_unregister(LONAL); - /* fall through */ - - case KLO_INIT_LIB: - lib_fini (&klonal_lib); - break; - - case KLO_INIT_NOTHING: - return; - } - - memset(&klonal_data, 0, sizeof (klonal_data)); - - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); - - printk (KERN_INFO "Lustre: LO NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); - PORTAL_MODULE_UNUSE; -} - -static int -klonal_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - int rc; - ptl_process_id_t my_process_id; - int pkmem = atomic_read(&portal_kmemory); - - LASSERT (nal == &klonal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = klonal_lib.libnal_ni.ni_actual_limits; - return (PTL_OK); - } - - LASSERT (klonal_data.klo_init == KLO_INIT_NOTHING); - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - - /* ensure all pointers NULL etc */ - memset (&klonal_data, 0, sizeof (klonal_data)); - - my_process_id.nid = 0; - my_process_id.pid = requested_pid; - - rc = lib_init(&klonal_lib, nal, my_process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR ("lib_init failed %d\n", rc); - klonal_shutdown (nal); - return (rc); - } - - klonal_data.klo_init = KLO_INIT_LIB; - - rc = libcfs_nal_cmd_register (LONAL, &klonal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - klonal_shutdown (nal); - return (PTL_FAIL); - } - - klonal_data.klo_init = KLO_INIT_ALL; - - printk(KERN_INFO "Lustre: LO NAL (initial mem %d)\n", pkmem); - PORTAL_MODULE_USE; - - return (PTL_OK); -} - -void __exit -klonal_finalise (void) -{ - PtlNIFini(klonal_ni); - - ptl_unregister_nal(LONAL); -} - -static int __init -klonal_initialise 
(void) -{ - int rc; - - klonal_api.nal_ni_init = klonal_startup; - klonal_api.nal_ni_fini = klonal_shutdown; - - rc = ptl_register_nal(LONAL, &klonal_api); - if (rc != PTL_OK) { - CERROR("Can't register LONAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Loopback NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init (klonal_initialise); -module_exit (klonal_finalise); diff --git a/lustre/portals/knals/lonal/lonal.h b/lustre/portals/knals/lonal/lonal.h deleted file mode 100644 index 9d3d3ff..0000000 --- a/lustre/portals/knals/lonal/lonal.h +++ /dev/null @@ -1,74 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef _LONAL_H -#define _LONAL_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include - -#define KLOD_IOV 153401 -#define KLOD_KIOV 153402 - -typedef struct -{ - unsigned int klod_type; - unsigned int klod_niov; - size_t klod_offset; - size_t klod_nob; - union { - struct iovec *iov; - ptl_kiov_t *kiov; - } klod_iov; -} klo_desc_t; - -typedef struct -{ - char klo_init; /* what's been initialised */ -} klonal_data_t; - -/* kqn_init state */ -#define KLO_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KLO_INIT_LIB 1 -#define KLO_INIT_ALL 2 - -extern lib_nal_t klonal_lib; -extern nal_t klonal_api; -extern klonal_data_t klonal_data; - -#endif /* _LONAL_H */ diff --git a/lustre/portals/knals/lonal/lonal_cb.c b/lustre/portals/knals/lonal/lonal_cb.c deleted file mode 100644 index cf5df0d..0000000 --- a/lustre/portals/knals/lonal/lonal_cb.c +++ /dev/null @@ -1,267 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include "lonal.h" - -/* - * LIB functions follow - * - */ -static int -klonal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - *dist = 0; /* it's me */ - return (0); -} - -static ptl_err_t -klonal_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_offset, - size_t payload_nob) -{ - klo_desc_t klod = { - .klod_type = KLOD_IOV, - .klod_niov = payload_niov, - .klod_offset = payload_offset, - .klod_nob = payload_nob, - .klod_iov = { .iov = payload_iov } }; - ptl_err_t rc; - - LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); - - rc = lib_parse(&klonal_lib, hdr, &klod); - if (rc == PTL_OK) - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - - return rc; -} - -static ptl_err_t -klonal_send_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - klo_desc_t klod = { - .klod_type = KLOD_KIOV, - .klod_niov = payload_niov, - .klod_offset = payload_offset, - .klod_nob = payload_nob, - .klod_iov = { .kiov = payload_kiov } }; - ptl_err_t rc; - - LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); - - rc = lib_parse(&klonal_lib, hdr, &klod); - if (rc == PTL_OK) - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - - return rc; -} - -static ptl_err_t -klonal_recv(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) -{ - klo_desc_t *klod = (klo_desc_t *)private; - - /* I only handle mapped->mapped matches */ - LASSERT(klod->klod_type == KLOD_IOV); - - if (mlen == 0) - return PTL_OK; - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT(niov > 0); - } - - while (klod->klod_offset >= klod->klod_iov.iov->iov_len) { 
- klod->klod_offset -= klod->klod_iov.iov->iov_len; - klod->klod_iov.iov++; - klod->klod_niov--; - LASSERT(klod->klod_niov > 0); - } - - do { - int fraglen = MIN(iov->iov_len - offset, - klod->klod_iov.iov->iov_len - klod->klod_offset); - - LASSERT(niov > 0); - LASSERT(klod->klod_niov > 0); - - if (fraglen > mlen) - fraglen = mlen; - - memcpy((void *)((unsigned long)iov->iov_base + offset), - (void *)((unsigned long)klod->klod_iov.iov->iov_base + - klod->klod_offset), - fraglen); - - if (offset + fraglen < iov->iov_len) { - offset += fraglen; - } else { - offset = 0; - iov++; - niov--; - } - - if (klod->klod_offset + fraglen < klod->klod_iov.iov->iov_len ) { - klod->klod_offset += fraglen; - } else { - klod->klod_offset = 0; - klod->klod_iov.iov++; - klod->klod_niov--; - } - - mlen -= fraglen; - } while (mlen > 0); - - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - return PTL_OK; -} - -static ptl_err_t -klonal_recv_pages(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) -{ - void *srcaddr = NULL; - void *dstaddr = NULL; - unsigned long srcfrag = 0; - unsigned long dstfrag = 0; - unsigned long fraglen; - klo_desc_t *klod = (klo_desc_t *)private; - - /* I only handle unmapped->unmapped matches */ - LASSERT(klod->klod_type == KLOD_KIOV); - - if (mlen == 0) - return PTL_OK; - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT(niov > 0); - } - - while (klod->klod_offset >= klod->klod_iov.kiov->kiov_len) { - klod->klod_offset -= klod->klod_iov.kiov->kiov_len; - klod->klod_iov.kiov++; - klod->klod_niov--; - LASSERT(klod->klod_niov > 0); - } - - do { - /* CAVEAT EMPTOR: I kmap 2 pages at once == slight risk of deadlock */ - LASSERT(niov > 0); - if (dstaddr == NULL) { - dstaddr = (void *)((unsigned long)kmap(kiov->kiov_page) + - kiov->kiov_offset + offset); - dstfrag = kiov->kiov_len - offset; - } - - LASSERT(klod->klod_niov > 0); - 
if (srcaddr == NULL) { - srcaddr = (void *)((unsigned long)kmap(klod->klod_iov.kiov->kiov_page) + - klod->klod_iov.kiov->kiov_offset + klod->klod_offset); - srcfrag = klod->klod_iov.kiov->kiov_len - klod->klod_offset; - } - - fraglen = MIN(srcfrag, dstfrag); - if (fraglen > mlen) - fraglen = mlen; - - memcpy(dstaddr, srcaddr, fraglen); - - if (fraglen < dstfrag) { - dstfrag -= fraglen; - dstaddr = (void *)((unsigned long)dstaddr + fraglen); - } else { - kunmap(kiov->kiov_page); - dstaddr = NULL; - offset = 0; - kiov++; - niov--; - } - - if (fraglen < srcfrag) { - srcfrag -= fraglen; - srcaddr = (void *)((unsigned long)srcaddr + fraglen); - } else { - kunmap(klod->klod_iov.kiov->kiov_page); - srcaddr = NULL; - klod->klod_offset = 0; - klod->klod_iov.kiov++; - klod->klod_niov--; - } - - mlen -= fraglen; - } while (mlen > 0); - - if (dstaddr != NULL) - kunmap(kiov->kiov_page); - - if (srcaddr != NULL) - kunmap(klod->klod_iov.kiov->kiov_page); - - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - return PTL_OK; -} - -lib_nal_t klonal_lib = -{ - libnal_data: &klonal_data, /* NAL private data */ - libnal_send: klonal_send, - libnal_send_pages: klonal_send_pages, - libnal_recv: klonal_recv, - libnal_recv_pages: klonal_recv_pages, - libnal_dist: klonal_dist -}; diff --git a/lustre/portals/knals/openibnal/.cvsignore b/lustre/portals/knals/openibnal/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/knals/openibnal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/openibnal/Makefile.in b/lustre/portals/knals/openibnal/Makefile.in deleted file mode 100644 index 9b8ed5d..0000000 --- a/lustre/portals/knals/openibnal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kopenibnal -kopenibnal-objs := openibnal.o openibnal_cb.o - -EXTRA_POST_CFLAGS := @OPENIBCPPFLAGS@ - -@INCLUDE_RULES@ diff --git 
a/lustre/portals/knals/openibnal/Makefile.mk b/lustre/portals/knals/openibnal/Makefile.mk deleted file mode 100644 index bd8043e..0000000 --- a/lustre/portals/knals/openibnal/Makefile.mk +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -include $(src)/../../Kernelenv - -obj-y += kopenibnal.o -kopenibnal-objs := openibnal.o openibnal_cb.o - diff --git a/lustre/portals/knals/openibnal/autoMakefile.am b/lustre/portals/knals/openibnal/autoMakefile.am deleted file mode 100644 index a4207ae..0000000 --- a/lustre/portals/knals/openibnal/autoMakefile.am +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -if BUILD_OPENIBNAL -modulenet_DATA = kopenibnal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kopenibnal-objs:%.o=%.c) openibnal.h diff --git a/lustre/portals/knals/openibnal/openibnal.c b/lustre/portals/knals/openibnal/openibnal.c deleted file mode 100644 index 652eb34..0000000 --- a/lustre/portals/knals/openibnal/openibnal.c +++ /dev/null @@ -1,1486 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "openibnal.h" - -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_data_t kibnal_data; -kib_tunables_t kibnal_tunables; - -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 - -#define IBNAL_SYSCTL_TIMEOUT 1 - -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif - -void -print_service(struct ib_common_attrib_service *service, char *tag, int rc) -{ - char name[32]; - - if (service == NULL) - { - CWARN("tag : %s\n" - "status : %d (NULL)\n", tag, rc); - return; - } - strncpy (name, service->service_name, sizeof(name)-1); - name[sizeof(name)-1] = 0; - - CWARN("tag : %s\n" - "status : %d\n" - "service id: "LPX64"\n" - "name : %s\n" - "NID : "LPX64"\n", tag, rc, - service->service_id, name, - *kibnal_service_nid_field(service)); -} - -void -kibnal_service_setunset_done (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_common_attrib_service *service, void *arg) -{ - *(int *)arg = status; - up (&kibnal_data.kib_nid_signal); -} - -#if IBNAL_CHECK_ADVERT -void -kibnal_check_advert (void) -{ - struct ib_common_attrib_service *svc; - __u64 tid; - int rc; - int rc2; - - PORTAL_ALLOC(svc, sizeof(*svc)); - if (svc == NULL) - return; - - memset (svc, 0, sizeof (*svc)); - kibnal_set_service_keys(svc, kibnal_data.kib_nid); - - rc = ib_service_get (kibnal_data.kib_device, - kibnal_data.kib_port, - svc, - KIBNAL_SERVICE_KEY_MASK, - kibnal_tunables.kib_io_timeout * HZ, - kibnal_service_setunset_done, &rc2, - &tid); - - if (rc != 0) { - CERROR ("Immediate error %d checking SM 
service\n", rc); - } else { - down (&kibnal_data.kib_nid_signal); - rc = rc2; - - if (rc != 0) - CERROR ("Error %d checking SM service\n", rc); - } - - PORTAL_FREE(svc, sizeof(*svc)); -} -#endif - -int -kibnal_advertise (void) -{ - struct ib_common_attrib_service *svc; - __u64 tid; - int rc; - int rc2; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - - PORTAL_ALLOC(svc, sizeof(*svc)); - if (svc == NULL) - return (-ENOMEM); - - memset (svc, 0, sizeof (*svc)); - - svc->service_id = kibnal_data.kib_service_id; - - rc = ib_cached_gid_get(kibnal_data.kib_device, - kibnal_data.kib_port, - 0, - svc->service_gid); - if (rc != 0) { - CERROR ("Can't get port %d GID: %d\n", - kibnal_data.kib_port, rc); - goto out; - } - - rc = ib_cached_pkey_get(kibnal_data.kib_device, - kibnal_data.kib_port, - 0, - &svc->service_pkey); - if (rc != 0) { - CERROR ("Can't get port %d PKEY: %d\n", - kibnal_data.kib_port, rc); - goto out; - } - - svc->service_lease = 0xffffffff; - - kibnal_set_service_keys(svc, kibnal_data.kib_nid); - - CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - svc->service_id, - svc->service_name, *kibnal_service_nid_field(svc)); - - rc = ib_service_set (kibnal_data.kib_device, - kibnal_data.kib_port, - svc, - IB_SA_SERVICE_COMP_MASK_ID | - IB_SA_SERVICE_COMP_MASK_GID | - IB_SA_SERVICE_COMP_MASK_PKEY | - IB_SA_SERVICE_COMP_MASK_LEASE | - KIBNAL_SERVICE_KEY_MASK, - kibnal_tunables.kib_io_timeout * HZ, - kibnal_service_setunset_done, &rc2, &tid); - - if (rc != 0) { - CERROR ("Immediate error %d advertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); - goto out; - } - - down (&kibnal_data.kib_nid_signal); - - rc = rc2; - if (rc != 0) - CERROR ("Error %d advertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); - out: - PORTAL_FREE(svc, sizeof(*svc)); - return (rc); -} - -void -kibnal_unadvertise (int expect_success) -{ - struct ib_common_attrib_service *svc; - __u64 tid; - int rc; - int rc2; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); - - 
PORTAL_ALLOC(svc, sizeof(*svc)); - if (svc == NULL) - return; - - memset (svc, 0, sizeof(*svc)); - - kibnal_set_service_keys(svc, kibnal_data.kib_nid); - - CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - svc->service_name, *kibnal_service_nid_field(svc)); - - rc = ib_service_delete (kibnal_data.kib_device, - kibnal_data.kib_port, - svc, - KIBNAL_SERVICE_KEY_MASK, - kibnal_tunables.kib_io_timeout * HZ, - kibnal_service_setunset_done, &rc2, &tid); - if (rc != 0) { - CERROR ("Immediate error %d unadvertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); - goto out; - } - - down (&kibnal_data.kib_nid_signal); - - if ((rc2 == 0) == !!expect_success) - goto out; /* success: rc == 0 */ - - if (expect_success) - CERROR("Error %d unadvertising NID "LPX64"\n", - rc, kibnal_data.kib_nid); - else - CWARN("Removed conflicting NID "LPX64"\n", - kibnal_data.kib_nid); - out: - PORTAL_FREE(svc, sizeof(*svc)); -} - -int -kibnal_set_mynid(ptl_nid_t nid) -{ - struct timeval tv; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - do_gettimeofday(&tv); - - down (&kibnal_data.kib_nid_mutex); - - if (nid == kibnal_data.kib_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); - - if (kibnal_data.kib_nid != PTL_NID_ANY) { - - kibnal_unadvertise (1); - - rc = ib_cm_listen_stop (kibnal_data.kib_listen_handle); - if (rc != 0) - CERROR ("Error %d stopping listener\n", rc); - } - - kibnal_data.kib_nid = ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. 
*/ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (kibnal_data.kib_nid == PTL_NID_ANY) { - /* No new NID to install */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - /* remove any previous advert (crashed node etc) */ - kibnal_unadvertise(0); - - /* Assign new service number */ - kibnal_data.kib_service_id = ib_cm_service_assign(); - CDEBUG(D_NET, "service_id "LPX64"\n", kibnal_data.kib_service_id); - - rc = ib_cm_listen(kibnal_data.kib_service_id, - TS_IB_CM_SERVICE_EXACT_MASK, - kibnal_passive_conn_callback, NULL, - &kibnal_data.kib_listen_handle); - if (rc == 0) { - rc = kibnal_advertise(); - if (rc == 0) { -#if IBNAL_CHECK_ADVERT - kibnal_check_advert(); -#endif - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - ib_cm_listen_stop(kibnal_data.kib_listen_handle); - /* remove any peers that sprung up while I failed to - * advertise myself */ - kibnal_del_peer (PTL_NID_ANY, 0); - } - - kibnal_data.kib_nid = PTL_NID_ANY; - up (&kibnal_data.kib_nid_mutex); - return (rc); -} - -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - LASSERT (nid != PTL_NID_ANY); - - PORTAL_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) - return (NULL); - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->ibp_nid = nid; - atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD (&peer->ibp_conns); - INIT_LIST_HEAD (&peer->ibp_tx_queue); - - peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - - atomic_inc (&kibnal_data.kib_npeers); - return (peer); -} - -void -kibnal_destroy_peer (kib_peer_t *peer) -{ - CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer); - - LASSERT (atomic_read (&peer->ibp_refcount) == 0); - LASSERT (peer->ibp_persistence == 0); - LASSERT (!kibnal_peer_active(peer)); - LASSERT (peer->ibp_connecting == 0); - LASSERT (list_empty (&peer->ibp_conns)); - LASSERT (list_empty 
(&peer->ibp_tx_queue)); - - PORTAL_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. */ - atomic_dec (&kibnal_data.kib_npeers); -} - -void -kibnal_put_peer (kib_peer_t *peer) -{ - CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", - peer, peer->ibp_nid, - atomic_read (&peer->ibp_refcount)); - - LASSERT (atomic_read (&peer->ibp_refcount) > 0); - if (!atomic_dec_and_test (&peer->ibp_refcount)) - return; - - kibnal_destroy_peer (peer); -} - -kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) -{ - struct list_head *peer_list = kibnal_nid2peerlist (nid); - struct list_head *tmp; - kib_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, kib_peer_t, ibp_list); - - LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ - peer->ibp_connecting != 0 || /* creating conns */ - !list_empty (&peer->ibp_conns)); /* active conn */ - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); - return (peer); - } - return (NULL); -} - -kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - read_lock (&kibnal_data.kib_global_lock); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? 
*/ - atomic_inc (&peer->ibp_refcount); - read_unlock (&kibnal_data.kib_global_lock); - - return (peer); -} - -void -kibnal_unlink_peer_locked (kib_peer_t *peer) -{ - LASSERT (peer->ibp_persistence == 0); - LASSERT (list_empty(&peer->ibp_conns)); - - LASSERT (kibnal_peer_active(peer)); - list_del_init (&peer->ibp_list); - /* lose peerlist's ref */ - kibnal_put_peer (peer); -} - -int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) -{ - kib_peer_t *peer; - struct list_head *ptmp; - int i; - - read_lock (&kibnal_data.kib_global_lock); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *persistencep = peer->ibp_persistence; - - read_unlock (&kibnal_data.kib_global_lock); - return (0); - } - } - - read_unlock (&kibnal_data.kib_global_lock); - return (-ENOENT); -} - -int -kibnal_add_persistent_peer (ptl_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_peer_t *peer2; - - if (nid == PTL_NID_ANY) - return (-EINVAL); - - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - peer2 = kibnal_find_peer_locked (nid); - if (peer2 != NULL) { - kibnal_put_peer (peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist (nid)); - } - - peer->ibp_persistence++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (0); -} - -void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kib_conn_t *conn; - - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; - - if 
(peer->ibp_persistence != 0) - return; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, kib_conn_t, ibc_list); - - kibnal_close_conn_locked (conn, 0); - } - - /* NB peer unlinks itself when last conn is closed */ -} - -int -kibnal_del_peer (ptl_nid_t nid, int single_share) -{ - unsigned long flags; - struct list_head *ptmp; - struct list_head *pnxt; - kib_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) - continue; - - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ - - if (single_share) - goto out; - } - } - out: - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - return (rc); -} - -kib_conn_t * -kibnal_get_conn_by_idx (int index) -{ - kib_peer_t *peer; - struct list_head *ptmp; - kib_conn_t *conn; - struct list_head *ctmp; - int i; - - read_lock (&kibnal_data.kib_global_lock); - - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - list_for_each (ptmp, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence > 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - list_for_each (ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - 
read_unlock (&kibnal_data.kib_global_lock); - return (conn); - } - } - } - - read_unlock (&kibnal_data.kib_global_lock); - return (NULL); -} - -kib_conn_t * -kibnal_create_conn (void) -{ - kib_conn_t *conn; - int i; - __u64 vaddr = 0; - __u64 vaddr_base; - int page_offset; - int ipage; - int rc; - union { - struct ib_qp_create_param qp_create; - struct ib_qp_attribute qp_attr; - } params; - - PORTAL_ALLOC (conn, sizeof (*conn)); - if (conn == NULL) { - CERROR ("Can't allocate connection\n"); - return (NULL); - } - - /* zero flags, NULL pointers etc... */ - memset (conn, 0, sizeof (*conn)); - - INIT_LIST_HEAD (&conn->ibc_tx_queue); - INIT_LIST_HEAD (&conn->ibc_active_txs); - spin_lock_init (&conn->ibc_lock); - - atomic_inc (&kibnal_data.kib_nconns); - /* well not really, but I call destroy() on failure, which decrements */ - - PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) - goto failed; - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, - IBNAL_RX_MSG_PAGES, - IB_ACCESS_LOCAL_WRITE); - if (rc != 0) - goto failed; - - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_vaddr = vaddr; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_RX_MSG_PAGES); - } - } - - params.qp_create = (struct ib_qp_create_param) { - .limit = { - /* Sends have an optional RDMA */ - .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE, - .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE, - .max_send_gather_element = 1, 
- .max_receive_scatter_element = 1, - }, - .pd = kibnal_data.kib_pd, - .send_queue = kibnal_data.kib_cq, - .receive_queue = kibnal_data.kib_cq, - .send_policy = IB_WQ_SIGNAL_SELECTABLE, - .receive_policy = IB_WQ_SIGNAL_SELECTABLE, - .rd_domain = 0, - .transport = IB_TRANSPORT_RC, - .device_specific = NULL, - }; - - rc = ib_qp_create (&params.qp_create, &conn->ibc_qp, &conn->ibc_qpn); - if (rc != 0) { - CERROR ("Failed to create queue pair: %d\n", rc); - goto failed; - } - - /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - - params.qp_attr = (struct ib_qp_attribute) { - .state = IB_QP_STATE_INIT, - .port = kibnal_data.kib_port, - .enable_rdma_read = 1, - .enable_rdma_write = 1, - .valid_fields = (IB_QP_ATTRIBUTE_STATE | - IB_QP_ATTRIBUTE_PORT | - IB_QP_ATTRIBUTE_PKEY_INDEX | - IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE), - }; - rc = ib_qp_modify(conn->ibc_qp, &params.qp_attr); - if (rc != 0) { - CERROR ("Failed to modify queue pair: %d\n", rc); - goto failed; - } - - /* 1 ref for caller */ - atomic_set (&conn->ibc_refcount, 1); - return (conn); - - failed: - kibnal_destroy_conn (conn); - return (NULL); -} - -void -kibnal_destroy_conn (kib_conn_t *conn) -{ - int rc; - - CDEBUG (D_NET, "connection %p\n", conn); - - LASSERT (atomic_read (&conn->ibc_refcount) == 0); - LASSERT (list_empty(&conn->ibc_tx_queue)); - LASSERT (list_empty(&conn->ibc_active_txs)); - LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); - - switch (conn->ibc_state) { - case IBNAL_CONN_ZOMBIE: - /* called after connection sequence initiated */ - - case IBNAL_CONN_INIT_QP: - rc = ib_qp_destroy(conn->ibc_qp); - if (rc != 0) - CERROR("Can't destroy QP: %d\n", rc); - /* fall through */ - - case IBNAL_CONN_INIT_NOTHING: - break; - - default: - LASSERT (0); - } - - if (conn->ibc_rx_pages != NULL) - kibnal_free_pages(conn->ibc_rx_pages); - - if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, - IBNAL_RX_MSGS * sizeof(kib_rx_t)); - - if (conn->ibc_peer != NULL) - 
kibnal_put_peer(conn->ibc_peer); - - PORTAL_FREE(conn, sizeof (*conn)); - - atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. */ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_connd_waitq); - } -} - -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* last ref only goes on zombies */ - LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); -} - -int -kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - count++; - kibnal_close_conn_locked (conn, why); - } - - return (count); -} - -int -kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) -{ - kib_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - if (conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->ibp_nid, conn->ibc_incarnation, incarnation); - - count++; - kibnal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -kibnal_close_matching_conns (ptl_nid_t nid) 
-{ - unsigned long flags; - kib_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; - else { - lo = 0; - hi = kibnal_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { - - peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || - !list_empty (&peer->ibp_conns)); - - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kibnal_close_peer_conns_locked (peer, 0); - } - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == PTL_NID_ANY) - return (0); - - return (count == 0 ? -ENOENT : 0); -} - -int -kibnal_cmd(struct portals_cfg *pcfg, void * private) -{ - int rc = -EINVAL; - - LASSERT (pcfg != NULL); - - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - int share_count = 0; - - rc = kibnal_get_peer_info(pcfg->pcfg_count, - &nid, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); - break; - } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); - break; - } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); - } - break; - } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); - 
break; - } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) - rc = -EINVAL; - else - rc = kibnal_set_mynid (pcfg->pcfg_nid); - break; - } - } - - return rc; -} - -void -kibnal_free_pages (kib_pages_t *p) -{ - int npages = p->ibp_npages; - int rc; - int i; - - if (p->ibp_mapped) { - rc = ib_memory_deregister(p->ibp_handle); - if (rc != 0) - CERROR ("Deregister error: %d\n", rc); - } - - for (i = 0; i < npages; i++) - if (p->ibp_pages[i] != NULL) - __free_page(p->ibp_pages[i]); - - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); -} - -int -kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) -{ - kib_pages_t *p; - struct ib_physical_buffer *phys_pages; - int i; - int rc; - - PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); - if (p == NULL) { - CERROR ("Can't allocate buffer %d\n", npages); - return (-ENOMEM); - } - - memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_page (GFP_KERNEL); - if (p->ibp_pages[i] == NULL) { - CERROR ("Can't allocate page %d of %d\n", i, npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - } - - PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); - if (phys_pages == NULL) { - CERROR ("Can't allocate physarray for %d pages\n", npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - - for (i = 0; i < npages; i++) { - phys_pages[i].size = PAGE_SIZE; - phys_pages[i].address = - kibnal_page2phys(p->ibp_pages[i]); - } - - p->ibp_vaddr = 0; - rc = ib_memory_register_physical(kibnal_data.kib_pd, - phys_pages, npages, - &p->ibp_vaddr, - npages * PAGE_SIZE, 0, - access, - &p->ibp_handle, - &p->ibp_lkey, - &p->ibp_rkey); - - PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); - - if (rc != 0) { - CERROR ("Error %d mapping %d pages\n", rc, npages); - kibnal_free_pages(p); - return (rc); - } - - p->ibp_mapped = 1; - *pp = p; - return (0); -} - -int -kibnal_setup_tx_descs (void) -{ - int ipage = 
0; - int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; - struct page *page; - kib_tx_t *tx; - int i; - int rc; - - /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, - IBNAL_TX_MSG_PAGES, - 0); /* local read access only */ - if (rc != 0) - return (rc); - - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - - for (i = 0; i < IBNAL_TX_MSGS; i++) { - page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; - tx = &kibnal_data.kib_tx_descs[i]; - - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= IBNAL_NTX); - tx->tx_mapped = KIB_TX_UNMAPPED; - - CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", - i, tx, tx->tx_msg, tx->tx_vaddr); - - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - - page_offset += IBNAL_MSG_SIZE; - LASSERT (page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); - } - } - - return (0); -} - -void -kibnal_api_shutdown (nal_t *nal) -{ - int i; - int rc; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &kibnal_api); - - switch (kibnal_data.kib_init) { - default: - CERROR ("Unexpected state %d\n", kibnal_data.kib_init); - LBUG(); - - case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(OPENIBNAL); - /* No new peers */ - - /* resetting my NID to unadvertises me, removes my - * listener and 
nukes all current peers */ - kibnal_set_mynid (PTL_NID_ANY); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to close down\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_CQ: - rc = ib_cq_destroy (kibnal_data.kib_cq); - if (rc != 0) - CERROR ("Destroy CQ error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_TXD: - kibnal_free_pages (kibnal_data.kib_tx_pages); - /* fall through */ -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif - case IBNAL_INIT_PD: - rc = ib_pd_destroy(kibnal_data.kib_pd); - if (rc != 0) - CERROR ("Destroy PD error: %d\n", rc); - /* fall through */ - - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); - /* fall through */ - - case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); - LASSERT (kibnal_data.kib_peers != NULL); - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { - LASSERT (list_empty (&kibnal_data.kib_peers[i])); - } - LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); - LASSERT (list_empty (&kibnal_data.kib_connd_conns)); - LASSERT (list_empty (&kibnal_data.kib_connd_peers)); - - /* flag threads to terminate; wake and wait for them to die */ - kibnal_data.kib_shutdown = 1; - wake_up_all (&kibnal_data.kib_sched_waitq); - wake_up_all (&kibnal_data.kib_connd_waitq); - - i = 2; - while (atomic_read (&kibnal_data.kib_nthreads) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "Waiting for %d threads to terminate\n", - atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); - } - /* fall through */ - - case IBNAL_INIT_NOTHING: - break; - } - - if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - - if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * - kibnal_data.kib_peer_hash_size); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); - - kibnal_data.kib_init = IBNAL_INIT_NOTHING; -} - -int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; - - LASSERT (nal == &kibnal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } - - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); - - memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ - - init_MUTEX (&kibnal_data.kib_nid_mutex); - init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); - kibnal_data.kib_nid = PTL_NID_ANY; - - rwlock_init(&kibnal_data.kib_global_lock); - - kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, - sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); - if (kibnal_data.kib_peers == NULL) { - goto failed; - } - for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); - - spin_lock_init (&kibnal_data.kib_connd_lock); - INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); - INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); - init_waitqueue_head 
(&kibnal_data.kib_connd_waitq); - - spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); - init_waitqueue_head (&kibnal_data.kib_sched_waitq); - - spin_lock_init (&kibnal_data.kib_tx_lock); - INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) { - CERROR ("Can't allocate tx descs\n"); - goto failed; - } - - /* lists/ptrs/locks initialised */ - kibnal_data.kib_init = IBNAL_INIT_DATA; - /*****************************************************/ - - - process_id.pid = requested_pid; - process_id.nid = kibnal_data.kib_nid; - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ - - for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)i); - if (rc != 0) { - CERROR("Can't spawn openibnal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - rc = kibnal_thread_start (kibnal_connd, NULL); - if (rc != 0) { - CERROR ("Can't spawn openibnal connd: %d\n", rc); - goto failed; - } - - kibnal_data.kib_device = ib_device_get_by_index(0); - if (kibnal_data.kib_device == NULL) { - CERROR ("Can't open ib device 0\n"); - goto failed; - } - - rc = ib_device_properties_get(kibnal_data.kib_device, - &kibnal_data.kib_device_props); - if (rc != 0) { - CERROR ("Can't get device props: %d\n", rc); - goto failed; - } - - CDEBUG(D_NET, "Max Initiator: %d Max Responder %d\n", - kibnal_data.kib_device_props.max_initiator_per_qp, - kibnal_data.kib_device_props.max_responder_per_qp); - - kibnal_data.kib_port = 0; 
- for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(kibnal_data.kib_device, i, - &kibnal_data.kib_port_props); - if (rc == 0) { - kibnal_data.kib_port = i; - break; - } - } - if (kibnal_data.kib_port == 0) { - CERROR ("Can't find a port\n"); - goto failed; - } - - rc = ib_pd_create(kibnal_data.kib_device, - NULL, &kibnal_data.kib_pd); - if (rc != 0) { - CERROR ("Can't create PD: %d\n", rc); - goto failed; - } - - /* flag PD initialised */ - kibnal_data.kib_init = IBNAL_INIT_PD; - /*****************************************************/ -#if IBNAL_FMR - { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif - /*****************************************************/ - - rc = kibnal_setup_tx_descs(); - if (rc != 0) { - CERROR ("Can't register tx descs: %d\n", rc); - goto failed; - } - - /* flag TX descs initialised */ - kibnal_data.kib_init = IBNAL_INIT_TXD; - /*****************************************************/ - - { - struct ib_cq_callback callback = { - .context = IBNAL_CALLBACK_CTXT, - .policy = IB_CQ_PROVIDER_REARM, - .function = { - .entry = kibnal_callback, - }, - .arg = NULL, - }; - int nentries = IBNAL_CQ_ENTRIES; - - rc = ib_cq_create (kibnal_data.kib_device, - &nentries, &callback, NULL, - &kibnal_data.kib_cq); - if (rc != 0) { - CERROR ("Can't create CQ: %d\n", rc); - goto failed; - } - - /* I only want solicited events */ - rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); - 
LASSERT (rc == 0); - } - - /* flag CQ initialised */ - kibnal_data.kib_init = IBNAL_INIT_CQ; - /*****************************************************/ - - rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - goto failed; - } - - /* flag everything initialised */ - kibnal_data.kib_init = IBNAL_INIT_ALL; - /*****************************************************/ - - printk(KERN_INFO "Lustre: OpenIB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); - - failed: - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); -} - -void __exit -kibnal_module_fini (void) -{ -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(OPENIBNAL); -} - -int __init -kibnal_module_init (void) -{ - int rc; - - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - - rc = ptl_register_nal(OPENIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(OPENIBNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init(kibnal_module_init); -module_exit(kibnal_module_fini); - diff --git a/lustre/portals/knals/openibnal/openibnal.h b/lustre/portals/knals/openibnal/openibnal.h deleted file mode 100644 index 2fbd88b..0000000 --- a/lustre/portals/knals/openibnal/openibnal.h +++ /dev/null @@ -1,533 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include - -#include -#include -#include - -#define IBNAL_SERVICE_NAME "openibnal" - -#if CONFIG_SMP -# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define IBNAL_N_SCHED 1 /* # schedulers */ -#endif - -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define IBNAL_RETRY 7 /* # times to retry */ -#define IBNAL_RNR_RETRY 7 /* */ -#define IBNAL_CM_RETRY 7 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_RESPONDER_RESOURCES 8 - -#define IBNAL_NTX 64 /* # tx descs */ -#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ - -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ - -/************************/ -/* derived constants... */ - -/* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) -#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* RX messages (per connection) */ -#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -/* we may have up to 2 completions per transmit + - 1 completion per receive, per connection */ -#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ - (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) - -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_FMR 1 -#define IBNAL_CKSUM 0 -//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT - -typedef struct -{ - int kib_io_timeout; /* comms timeout (seconds) */ - struct ctl_table_header *kib_sysctl; /* sysctl interface */ -} kib_tunables_t; - -typedef struct -{ - 
int ibp_npages; /* # pages */ - int ibp_mapped; /* mapped? */ - __u64 ibp_vaddr; /* mapped region vaddr */ - __u32 ibp_lkey; /* mapped region lkey */ - __u32 ibp_rkey; /* mapped region rkey */ - struct ib_mr *ibp_handle; /* mapped region handle */ - struct page *ibp_pages[0]; -} kib_pages_t; - -typedef struct -{ - int kib_init; /* initialisation state */ - __u64 kib_incarnation; /* which one am I */ - int kib_shutdown; /* shut down? */ - atomic_t kib_nthreads; /* # live threads */ - - __u64 kib_service_id; /* service number I listen on */ - ptl_nid_t kib_nid; /* my NID */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ - struct semaphore kib_nid_signal; /* signal completion */ - - rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - atomic_t kib_npeers; /* # peers extant */ - atomic_t kib_nconns; /* # connections extant */ - - struct list_head kib_connd_conns; /* connections to progress */ - struct list_head kib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ - unsigned long kib_connd_waketime; /* when connd will wake */ - spinlock_t kib_connd_lock; /* serialise */ - - wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - struct list_head kib_sched_txq; /* tx requiring attention */ - struct list_head kib_sched_rxq; /* rx requiring attention */ - spinlock_t kib_sched_lock; /* serialise */ - - struct kib_tx *kib_tx_descs; /* all the tx descriptors */ - kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ - - struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ - __u64 kib_next_tx_cookie; /* RDMA completion cookie */ - spinlock_t kib_tx_lock; /* serialise */ - - struct ib_device *kib_device; /* 
"the" device */ - struct ib_device_properties kib_device_props; /* its properties */ - int kib_port; /* port on the device */ - struct ib_port_properties kib_port_props; /* its properties */ - struct ib_pd *kib_pd; /* protection domain */ -#if IBNAL_FMR - struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */ -#endif - struct ib_cq *kib_cq; /* completion queue */ - void *kib_listen_handle; /* where I listen for connections */ - -} kib_data_t; - -#define IBNAL_INIT_NOTHING 0 -#define IBNAL_INIT_DATA 1 -#define IBNAL_INIT_LIB 2 -#define IBNAL_INIT_PD 3 -#define IBNAL_INIT_FMR 4 -#define IBNAL_INIT_TXD 5 -#define IBNAL_INIT_CQ 6 -#define IBNAL_INIT_ALL 7 - -/************************************************************************ - * Wire message structs. - * These are sent in sender's byte order (i.e. receiver flips). - * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD - * private data and SM service info), is LE on the wire. - */ - -typedef struct -{ - union { - struct ib_mr *mr; - struct ib_fmr *fmr; - } md_handle; - __u32 md_lkey; - __u32 md_rkey; - __u64 md_addr; -} kib_md_t; - -typedef struct -{ - __u32 rd_key; /* remote key */ - __u32 rd_nob; /* # of bytes */ - __u64 rd_addr; /* remote io vaddr */ -} kib_rdma_desc_t; - - -typedef struct -{ - ptl_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} kib_immediate_msg_t; - -typedef struct -{ - ptl_hdr_t ibrm_hdr; /* portals header */ - __u64 ibrm_cookie; /* opaque completion cookie */ - kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ -} kib_rdma_msg_t; - -typedef struct -{ - __u64 ibcm_cookie; /* opaque completion cookie */ - __u32 ibcm_status; /* completion status */ -} kib_completion_msg_t; - -typedef struct -{ - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ -#if IBNAL_CKSUM - __u32 ibm_nob; - __u32 ibm_cksum; 
-#endif - union { - kib_immediate_msg_t immediate; - kib_rdma_msg_t rdma; - kib_completion_msg_t completion; - } ibm_u; -} kib_msg_t; - -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 1 /* current protocol version */ - -#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ - -/***********************************************************************/ - -typedef struct kib_rx /* receive message */ -{ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_rdma; /* RDMA completion posted? */ - int rx_posted; /* posted? */ - __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - struct ib_receive_param rx_sp; /* receive work item */ - struct ib_gather_scatter rx_gl; /* and it's memory */ -} kib_rx_t; - -typedef struct kib_tx /* transmit message */ -{ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ - struct kib_conn *tx_conn; /* owning conn */ - int tx_mapped; /* mapped for RDMA? 
*/ - int tx_sending; /* # tx callbacks outstanding */ - int tx_status; /* completion status */ - unsigned long tx_deadline; /* completion deadline */ - int tx_passive_rdma; /* peer sucks/blows */ - int tx_passive_rdma_wait; /* waiting for peer to complete */ - __u64 tx_passive_rdma_cookie; /* completion cookie */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - kib_md_t tx_md; /* RDMA mapping (active/passive) */ - __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ - kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ - int tx_nsp; /* # send work items */ - struct ib_send_param tx_sp[2]; /* send work items... */ - struct ib_gather_scatter tx_gl[2]; /* ...and their memory */ -} kib_tx_t; - -#define KIB_TX_UNMAPPED 0 -#define KIB_TX_MAPPED 1 -#define KIB_TX_MAPPED_FMR 2 - -typedef struct kib_wire_connreq -{ - __u32 wcr_magic; /* I'm an openibnal connreq */ - __u16 wcr_version; /* this is my version number */ - __u16 wcr_queue_depth; /* this is my receive queue size */ - __u64 wcr_nid; /* peer's NID */ - __u64 wcr_incarnation; /* peer's incarnation */ -} kib_wire_connreq_t; - -typedef struct kib_connreq -{ - /* connection-in-progress */ - struct kib_conn *cr_conn; - kib_wire_connreq_t cr_wcr; - __u64 cr_tid; - struct ib_common_attrib_service cr_service; - tTS_IB_GID cr_gid; - struct ib_path_record cr_path; - struct ib_cm_active_param cr_connparam; -} kib_connreq_t; - -typedef struct kib_conn -{ - struct kib_peer *ibc_peer; /* owning peer */ - struct list_head ibc_list; /* stash on peer's conn list */ - __u64 ibc_incarnation; /* which instance of the peer */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - atomic_t ibc_nob; /* # bytes buffered */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - struct list_head ibc_tx_queue; /* send queue */ - struct list_head ibc_active_txs; /* active tx awaiting completion */ 
- spinlock_t ibc_lock; /* serialise */ - kib_rx_t *ibc_rxs; /* the rx descs */ - kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - struct ib_qp *ibc_qp; /* queue pair */ - __u32 ibc_qpn; /* queue pair number */ - tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? */ - kib_connreq_t *ibc_connreq; /* connection request state */ -} kib_conn_t; - -#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ -#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ -#define IBNAL_CONN_CONNECTING 2 /* started to connect */ -#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define IBNAL_CONN_DEATHROW 4 /* waiting to be closed */ -#define IBNAL_CONN_ZOMBIE 5 /* waiting to be freed */ - -typedef struct kib_peer -{ - struct list_head ibp_list; /* stash on global peer list */ - struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - ptl_nid_t ibp_nid; /* who's on the other end(s) */ - atomic_t ibp_refcount; /* # users */ - int ibp_persistence; /* "known" peer refs */ - struct list_head ibp_conns; /* all active connections */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* connecting+accepting */ - unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ - unsigned long ibp_reconnect_interval; /* exponential backoff */ -} kib_peer_t; - - -extern lib_nal_t kibnal_lib; -extern kib_data_t kibnal_data; -extern kib_tunables_t kibnal_tunables; - -static inline struct list_head * -kibnal_nid2peerlist (ptl_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; - - return (&kibnal_data.kib_peers [hash]); -} - -static inline int -kibnal_peer_active(kib_peer_t *peer) -{ - /* Am I in the peer hash table? 
*/ - return (!list_empty(&peer->ibp_list)); -} - -static inline void -kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) -{ - /* CAVEAT EMPTOR: tx takes caller's ref on conn */ - - LASSERT (tx->tx_nsp > 0); /* work items set up */ - LASSERT (tx->tx_conn == NULL); /* only set here */ - - tx->tx_conn = conn; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); -} - -#define KIBNAL_SERVICE_KEY_MASK (IB_SA_SERVICE_COMP_MASK_NAME | \ - IB_SA_SERVICE_COMP_MASK_DATA8_1 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_2 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_3 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_4 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_5 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_6 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_7 | \ - IB_SA_SERVICE_COMP_MASK_DATA8_8) - -static inline __u64* -kibnal_service_nid_field(struct ib_common_attrib_service *srv) -{ - /* must be consistent with KIBNAL_SERVICE_KEY_MASK */ - return (__u64 *)srv->service_data8; -} - - -static inline void -kibnal_set_service_keys(struct ib_common_attrib_service *srv, ptl_nid_t nid) -{ - LASSERT (strlen (IBNAL_SERVICE_NAME) < sizeof(srv->service_name)); - memset (srv->service_name, 0, sizeof(srv->service_name)); - strcpy (srv->service_name, IBNAL_SERVICE_NAME); - - *kibnal_service_nid_field(srv) = cpu_to_le64(nid); -} - -#if 0 -static inline void -kibnal_show_rdma_attr (kib_conn_t *conn) -{ - struct ib_qp_attribute qp_attr; - int rc; - - memset (&qp_attr, 0, sizeof(qp_attr)); - rc = ib_qp_query(conn->ibc_qp, &qp_attr); - if (rc != 0) { - CERROR ("Can't get qp attrs: %d\n", rc); - return; - } - - CWARN ("RDMA CAPABILITY: write %s read %s\n", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_read ? 
"enabled" : "disabled") : "invalid"); -} -#endif - -#if CONFIG_X86 -static inline __u64 -kibnal_page2phys (struct page *p) -{ - __u64 page_number = p - mem_map; - - return (page_number << PAGE_SHIFT); -} -#else -# error "no page->phys" -#endif - -/* CAVEAT EMPTOR: - * We rely on tx/rx descriptor alignment to allow us to use the lowest bit - * of the work request id as a flag to determine if the completion is for a - * transmit or a receive. It seems that that the CQ entry's 'op' field - * isn't always set correctly on completions that occur after QP teardown. */ - -static inline __u64 -kibnal_ptr2wreqid (void *ptr, int isrx) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT ((lptr & 1) == 0); - return (__u64)(lptr | (isrx ? 1 : 0)); -} - -static inline void * -kibnal_wreqid2ptr (__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~1UL); -} - -static inline int -kibnal_wreqid_is_rx (__u64 wreqid) -{ - return (wreqid & 1) != 0; -} - -extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); -extern void kibnal_put_peer (kib_peer_t *peer); -extern int kibnal_del_peer (ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); -extern void kibnal_unlink_peer_locked (kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_put_conn (kib_conn_t *conn); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages (kib_pages_t *p); - -extern void kibnal_check_sends (kib_conn_t *conn); - -extern tTS_IB_CM_CALLBACK_RETURN -kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, - void *param, void *arg); -extern tTS_IB_CM_CALLBACK_RETURN -kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_COMM_ID cid, - void *param, void *arg); - -extern void kibnal_close_conn_locked (kib_conn_t *conn, int 
error); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd (void *arg); -extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); -extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -extern int kibnal_close_conn (kib_conn_t *conn, int why); -extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); - - - - - diff --git a/lustre/portals/knals/openibnal/openibnal_cb.c b/lustre/portals/knals/openibnal/openibnal_cb.c deleted file mode 100644 index d774853..0000000 --- a/lustre/portals/knals/openibnal/openibnal_cb.c +++ /dev/null @@ -1,2597 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include "openibnal.h" - -/* - * LIB functions follow - * - */ -void -kibnal_schedule_tx_done (kib_tx_t *tx) -{ - unsigned long flags; - - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); - - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); -} - -void -kibnal_tx_done (kib_tx_t *tx) -{ - ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; - unsigned long flags; - int i; - int rc; - - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ - - switch (tx->tx_mapped) { - default: - LBUG(); - - case KIB_TX_UNMAPPED: - break; - - case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - rc = ib_memory_deregister(tx->tx_md.md_handle.mr); - LASSERT (rc == 0); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; - -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... 
*/ - kibnal_schedule_tx_done(tx); - return; - } - - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); - - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; -#endif - } - - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; - - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } - - if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; - tx->tx_status = 0; - - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); -} - -kib_tx_t * -kibnal_get_idle_tx (int may_block) -{ - unsigned long flags; - kib_tx_t *tx = NULL; - - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } - - /* block for idle tx */ - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); - } - - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... 
*/ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } - - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - return (tx); -} - -int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) -{ - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) - continue; - - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); - - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); - - if (idle) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); - return; - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); -} - -void -kibnal_post_rx (kib_rx_t *rx, int do_credits) -{ - kib_conn_t *conn = rx->rx_conn; - int rc; - unsigned long flags; - - rx->rx_gl = (struct ib_gather_scatter) { - .address = rx->rx_vaddr, - .length = IBNAL_MSG_SIZE, - .key = conn->ibc_rx_pages->ibp_lkey, - }; - - rx->rx_sp = (struct ib_receive_param) { - .work_request_id = kibnal_ptr2wreqid(rx, 1), - .scatter_list = &rx->rx_gl, - .num_scatter_entries = 1, - .device_specific = NULL, - .signaled = 1, - }; - - LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - LASSERT (!rx->rx_posted); - rx->rx_posted = 1; - mb(); - - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else - rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1); - - if (rc == 0) { - if (do_credits) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_outstanding_credits++; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - return; - } - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - } - - /* Drop rx's ref */ - kibnal_put_conn (conn); -} - -#if IBNAL_CKSUM -__u32 kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - return (sum); -} -#endif - -void -kibnal_rx_callback (struct ib_cq_entry *e) -{ - kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(e->work_request_id); - kib_msg_t *msg = rx->rx_msg; - kib_conn_t *conn = rx->rx_conn; - int nob = e->bytes_transferred; - const int base_nob = offsetof(kib_msg_t, ibm_u); - int credits; - int flipped; - unsigned long flags; -#if IBNAL_CKSUM - __u32 msg_cksum; - __u32 computed_cksum; -#endif - - CDEBUG (D_NET, "rx %p conn %p\n", rx, conn); - LASSERT 
(rx->rx_posted); - rx->rx_posted = 0; - mb(); - - /* receives complete with error in any case after we've started - * closing the QP */ - if (conn->ibc_state >= IBNAL_CONN_DEATHROW) - goto failed; - - /* We don't post receives until the conn is established */ - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, e->status); - goto failed; - } - - if (nob < base_nob) { - CERROR ("Short rx from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - - /* Receiver does any byte flipping if necessary... */ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flipped = 0; - } else { - if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->ibm_magic, conn->ibc_peer->ibp_nid); - goto failed; - } - flipped = 1; - __swab16s (&msg->ibm_version); - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); - } - - if (msg->ibm_version != IBNAL_MSG_VERSION) { - CERROR ("Incompatible msg version %d (%d expected)\n", - msg->ibm_version, IBNAL_MSG_VERSION); - goto failed; - } - -#if IBNAL_CKSUM - if (nob != msg->ibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); - goto failed; - } - - msg_cksum = le32_to_cpu(msg->ibm_cksum); - msg->ibm_cksum = 0; - computed_cksum = kibnal_cksum (msg, nob); - - if (msg_cksum != computed_cksum) { - CERROR ("Checksum failure %d: (%d expected)\n", - computed_cksum, msg_cksum); - goto failed; - } - CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); -#endif - - /* Have I received credits that will let me send? 
*/ - credits = msg->ibm_credits; - if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); - return; - - case IBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (kib_immediate_msg_t)) { - CERROR ("Short IMMEDIATE from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (kib_rdma_msg_t)) { - CERROR ("Short RDMA msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) { - __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_key); - __swab32s(&msg->ibm_u.rdma.ibrm_desc.rd_nob); - __swab64s(&msg->ibm_u.rdma.ibrm_desc.rd_addr); - } - CDEBUG(D_NET, "%d RDMA: cookie "LPX64", key %x, addr "LPX64", nob %d\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie, - msg->ibm_u.rdma.ibrm_desc.rd_key, - msg->ibm_u.rdma.ibrm_desc.rd_addr, - msg->ibm_u.rdma.ibrm_desc.rd_nob); - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (kib_completion_msg_t)) { - CERROR ("Short COMPLETION msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32s(&msg->ibm_u.completion.ibcm_status); - - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); - return; - - default: - CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->ibm_type); - goto failed; - } - - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up 
(&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); - - /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); -} - -void -kibnal_rx (kib_rx_t *rx) -{ - kib_msg_t *msg = rx->rx_msg; - - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; - - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; - - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. */ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); - break; - - case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... 
*/ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - break; - - case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); - break; - - default: - LBUG(); - break; - } - - kibnal_post_rx (rx, 1); -} - -#if 0 -int -kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); - - if (page == NULL || - !VALID_PAGE (page)) - return (-EFAULT); - - *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); - return (0); -} -#endif - -int -kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, - int niov, struct iovec *iov, int offset, int nob) - -{ - void *vaddr; - int rc; - - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR ("Can't map multiple vaddr fragments\n"); - return (-EMSGSIZE); - } - - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); - - rc = ib_memory_register (kibnal_data.kib_pd, - vaddr, nob, - access, - &tx->tx_md.md_handle.mr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - - if (rc != 0) { - CERROR ("Can't map vaddr: %d\n", rc); - return (rc); - } - - tx->tx_mapped = KIB_TX_MAPPED; - return (0); -} - -int -kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, - int nkiov, ptl_kiov_t *kiov, - int offset, int nob) -{ -#if IBNAL_FMR - __u64 *phys; - const int mapped = KIB_TX_MAPPED_FMR; -#else - struct ib_physical_buffer *phys; - const int mapped = 
KIB_TX_MAPPED; -#endif - int page_offset; - int nphys; - int resid; - int phys_size; - int rc; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - - page_offset = kiov->kiov_offset + offset; -#if IBNAL_FMR - phys[0] = kibnal_page2phys(kiov->kiov_page); -#else - phys[0].address = kibnal_page2phys(kiov->kiov_page); - phys[0].size = PAGE_SIZE; -#endif - nphys = 1; - resid = nob - (kiov->kiov_len - offset); - - while (resid > 0) { - kiov++; - nkiov--; - LASSERT (nkiov > 0); - - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - int i; - /* Can't have gaps */ - CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - { - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } - - rc = -EINVAL; - goto out; - } - - if (nphys == PTL_MD_MAX_IOV) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - LASSERT (nphys * sizeof (*phys) < phys_size); -#if IBNAL_FMR - phys[nphys] = kibnal_page2phys(kiov->kiov_page); -#else - phys[nphys].address = kibnal_page2phys(kiov->kiov_page); - phys[nphys].size = PAGE_SIZE; -#endif - nphys++; - - resid -= PAGE_SIZE; - } - -#if 0 - CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); - for (rc = 0; rc < nphys; rc++) - CWARN (" [%d] "LPX64" / %d\n", rc, phys[rc].address, phys[rc].size); -#endif - tx->tx_md.md_addr = IBNAL_RDMA_BASE; - -#if IBNAL_FMR - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys, nphys, - 
&tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - rc = ib_memory_register_physical (kibnal_data.kib_pd, - phys, nphys, - &tx->tx_md.md_addr, - nob, page_offset, - access, - &tx->tx_md.md_handle.mr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (rc == 0) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); - tx->tx_mapped = mapped; - } else { - CERROR ("Can't map phys: %d\n", rc); - rc = -EFAULT; - } - - out: - PORTAL_FREE(phys, phys_size); - return (rc); -} - -kib_conn_t * -kibnal_find_conn_locked (kib_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->ibp_conns) { - return (list_entry(tmp, kib_conn_t, ibc_list)); - } - - return (NULL); -} - -void -kibnal_check_sends (kib_conn_t *conn) -{ - unsigned long flags; - kib_tx_t *tx; - int rc; - int i; - int done; - int nwork; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); - - if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - tx = kibnal_get_idle_tx(0); /* don't block */ - if (tx != NULL) - kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); - kibnal_queue_tx_locked(tx, conn); - } - } - - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); - - /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); - - LASSERT (conn->ibc_outstanding_credits >= 0); - LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); - LASSERT (conn->ibc_credits >= 0); - LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - - /* Not on ibc_rdma_queue */ - LASSERT 
(!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) - break; - - if (conn->ibc_credits == 0) /* no credits */ - break; - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - break; - - list_del (&tx->tx_list); - - if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { - /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); - continue; - } - - tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; - conn->ibc_outstanding_credits = 0; - - conn->ibc_nsends_posted++; - conn->ibc_credits--; - - tx->tx_sending = tx->tx_nsp; - tx->tx_passive_rdma_wait = tx->tx_passive_rdma; - list_add (&tx->tx_list, &conn->ibc_active_txs); -#if IBNAL_CKSUM - tx->tx_msg->ibm_cksum = 0; - tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); -#endif - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* NB the gap between removing tx from the queue and sending it - * allows message re-ordering to occur */ - - LASSERT (tx->tx_nsp > 0); - - rc = -ECONNABORTED; - nwork = 0; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - tx->tx_status = 0; - /* Driver only accepts 1 item at a time */ - for (i = 0; i < tx->tx_nsp; i++) { - rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1); - if (rc != 0) - break; - nwork++; - } - } - - spin_lock_irqsave (&conn->ibc_lock, flags); - if (rc != 0) { - /* NB credits are transferred in the actual - * message, which can only be the last work item */ - conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - conn->ibc_credits++; - conn->ibc_nsends_posted--; - - tx->tx_status = rc; - tx->tx_passive_rdma_wait = 0; - tx->tx_sending -= tx->tx_nsp - nwork; - - done = (tx->tx_sending == 
0); - if (done) - list_del (&tx->tx_list); - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); - else - CDEBUG (D_NET, "Error %d posting transmit to " - LPX64"\n", rc, conn->ibc_peer->ibp_nid); - - kibnal_close_conn (conn, rc); - - if (done) - kibnal_tx_done (tx); - return; - } - - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); -} - -void -kibnal_tx_callback (struct ib_cq_entry *e) -{ - kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(e->work_request_id); - kib_conn_t *conn; - unsigned long flags; - int idle; - - conn = tx->tx_conn; - LASSERT (conn != NULL); - LASSERT (tx->tx_sending != 0); - - spin_lock_irqsave(&conn->ibc_lock, flags); - - CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, - tx->tx_nsp - tx->tx_sending, tx->tx_nsp, - e->status); - - /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. If it's - * not me, then I take an extra ref on conn so it can't disappear - * under me. */ - - tx->tx_sending--; - idle = (tx->tx_sending == 0) && /* This is the final callback */ - (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ - if (idle) - list_del(&tx->tx_list); - - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - - if (tx->tx_sending == 0) - conn->ibc_nsends_posted--; - - if (e->status != IB_COMPLETION_STATUS_SUCCESS && - tx->tx_status == 0) - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - if (idle) - kibnal_tx_done (tx); - - if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, e->status); - kibnal_close_conn (conn, -ENETDOWN); - } else { - /* can I shovel some more sends out the door? 
*/ - kibnal_check_sends(conn); - } - - kibnal_put_conn (conn); -} - -void -kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) -{ - if (kibnal_wreqid_is_rx(e->work_request_id)) - kibnal_rx_callback (e); - else - kibnal_tx_callback (e); -} - -void -kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) -{ - struct ib_gather_scatter *gl = &tx->tx_gl[tx->tx_nsp]; - struct ib_send_param *sp = &tx->tx_sp[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_sp)/sizeof(tx->tx_sp[0])); - LASSERT (nob <= IBNAL_MSG_SIZE); - - tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; - tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; - tx->tx_msg->ibm_type = type; -#if IBNAL_CKSUM - tx->tx_msg->ibm_nob = nob; -#endif - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); - - *gl = (struct ib_gather_scatter) { - .address = tx->tx_vaddr, - .length = nob, - .key = kibnal_data.kib_tx_pages->ibp_lkey, - }; - - /* NB If this is an RDMA read, the completion message must wait for - * the RDMA to complete. Sends wait for previous RDMA writes - * anyway... 
*/ - *sp = (struct ib_send_param) { - .work_request_id = kibnal_ptr2wreqid(tx, 0), - .op = IB_OP_SEND, - .gather_list = gl, - .num_gather_entries = 1, - .device_specific = NULL, - .solicited_event = 1, - .signaled = 1, - .immediate_data_valid = 0, - .fence = fence, - .inline_data = 0, - }; - - tx->tx_nsp++; -} - -void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) -{ - unsigned long flags; - - spin_lock_irqsave(&conn->ibc_lock, flags); - - kibnal_queue_tx_locked (tx, conn); - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); -} - -void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) -{ - unsigned long flags; - kib_peer_t *peer; - kib_conn_t *conn; - rwlock_t *g_lock = &kibnal_data.kib_global_lock; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nsp > 0); /* work items have been set up */ - - read_lock (g_lock); - - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock (g_lock); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - read_unlock (g_lock); - - kibnal_queue_tx (tx, conn); - return; - } - - /* Making one or more connections; I'll need a write lock... 
*/ - read_unlock (g_lock); - write_lock_irqsave (g_lock, flags); - - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - write_unlock_irqrestore (g_lock, flags); - - kibnal_queue_tx (tx, conn); - return; - } - - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - peer->ibp_connecting = 1; - atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); - } - - /* A connection is being established; queue the message... 
*/ - list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - - write_unlock_irqrestore (g_lock, flags); -} - -ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) -{ - int nob = libmsg->md->length; - kib_tx_t *tx; - kib_msg_t *ibmsg; - int rc; - int access; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || - type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ - - if (type == IBNAL_MSG_PUT_RDMA) { - access = IB_ACCESS_REMOTE_READ; - } else { - access = IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE; - } - - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); - - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob); - else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob); - - if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); - rc = -ENOMEM; - goto failed; - } - } - - tx->tx_passive_rdma = 1; - - ibmsg = tx->tx_msg; - - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; - ibmsg->ibm_u.rdma.ibrm_desc.rd_nob = nob; - - kibnal_init_tx_msg (tx, type, sizeof (kib_rdma_msg_t)); - - CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " - LPX64", nob %d\n", - tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, - tx->tx_md.md_addr, nob); - - /* libmsg gets finalized when tx completes. 
*/ - tx->tx_libmsg[0] = libmsg; - - kibnal_launch_tx(tx, nid); - return (PTL_OK); - - failed: - tx->tx_status = rc; - kibnal_tx_done (tx); - return (PTL_FAIL); -} - -void -kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob) -{ - kib_msg_t *rxmsg = rx->rx_msg; - kib_msg_t *txmsg; - kib_tx_t *tx; - int access; - int rdma_op; - int rc; - - CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", - type, status, niov, offset, nob); - - /* Called by scheduler */ - LASSERT (!in_interrupt ()); - - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - /* No data if we're completing with failure */ - LASSERT (status == 0 || nob == 0); - - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); - - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. 
*/ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; - - if (type == IBNAL_MSG_GET_DONE) { - access = 0; - rdma_op = IB_OP_RDMA_WRITE; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access = IB_ACCESS_LOCAL_WRITE; - rdma_op = IB_OP_RDMA_READ; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); - } - - tx = kibnal_get_idle_tx (0); /* Mustn't block */ - if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 - " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); - return; - } - LASSERT (tx->tx_nsp == 0); - - if (nob != 0) { - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, - niov, kiov, offset, nob); - else - rc = kibnal_map_iov (tx, access, - niov, iov, offset, nob); - - if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); - /* We'll skip the RDMA and complete with failure. 
*/ - status = rc; - nob = 0; - } else { - tx->tx_gl[0] = (struct ib_gather_scatter) { - .address = tx->tx_md.md_addr, - .length = nob, - .key = tx->tx_md.md_lkey, - }; - - tx->tx_sp[0] = (struct ib_send_param) { - .work_request_id = kibnal_ptr2wreqid(tx, 0), - .op = rdma_op, - .gather_list = &tx->tx_gl[0], - .num_gather_entries = 1, - .remote_address = rxmsg->ibm_u.rdma.ibrm_desc.rd_addr, - .rkey = rxmsg->ibm_u.rdma.ibrm_desc.rd_key, - .device_specific = NULL, - .solicited_event = 0, - .signaled = 1, - .immediate_data_valid = 0, - .fence = 0, - .inline_data = 0, - }; - - tx->tx_nsp = 1; - } - } - - txmsg = tx->tx_msg; - - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; - - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; - } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CDEBUG(D_WARNING,"No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); - } - - /* +1 ref for this tx... 
*/ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - rx->rx_conn, rx->rx_conn->ibc_state, - rx->rx_conn->ibc_peer->ibp_nid, - atomic_read (&rx->rx_conn->ibc_refcount)); - atomic_inc (&rx->rx_conn->ibc_refcount); - /* ...and queue it up */ - kibnal_queue_tx(tx, rx->rx_conn); -} - -ptl_err_t -kibnal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", payload_nob, payload_niov, nid , pid); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* Thread context if we're sending payload */ - LASSERT (!in_interrupt() || payload_niov == 0); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - switch (type) { - default: - LBUG(); - return (PTL_FAIL); - - case PTL_MSG_REPLY: { - /* reply's 'private' is the incoming receive */ - kib_rx_t *rx = private; - - /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - return (PTL_OK); - } - - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); - } - - /* Will it fit in a message? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob >= IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", - nid, payload_nob); - return (PTL_FAIL); - } - break; - } - - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; - - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; - - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); - - break; - } - - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); - if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? 
" (intr)" : ""); - return (PTL_NO_SPACE); - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } - - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); - - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; - - kibnal_launch_tx(tx, nid); - return (PTL_OK); -} - -ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} - -ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); -} - -ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch (rxmsg->ibm_type) { - default: - LBUG(); - return (PTL_FAIL); - - case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if 
(msg_nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); - } - - if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); - - case IBNAL_MSG_GET_RDMA: - /* We get called here just to discard any junk after the - * GET hdr. */ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); - - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); - return (PTL_OK); - } -} - -ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); -} - -ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); -} - -int -kibnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc (&kibnal_data.kib_nthreads); - return (0); -} - -void -kibnal_thread_fini (void) -{ - atomic_dec (&kibnal_data.kib_nthreads); -} - -void -kibnal_close_conn_locked (kib_conn_t *conn, int error) -{ - /* This just does the immmediate housekeeping, and schedules the - * connection for the connd to finish off. - * Caller holds kib_global_lock exclusively in irq context */ - kib_peer_t *peer = conn->ibc_peer; - - CDEBUG (error == 0 ? 
D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || - conn->ibc_state == IBNAL_CONN_CONNECTING); - - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_connd_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_connd_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - } - - if (list_empty (&peer->ibp_conns) && - peer->ibp_persistence == 0) { - /* Non-persistent peer with no more conns... */ - kibnal_unlink_peer_locked (peer); - } - - conn->ibc_state = IBNAL_CONN_DEATHROW; - - /* Schedule conn for closing/destruction */ - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); -} - -int -kibnal_close_conn (kib_conn_t *conn, int why) -{ - unsigned long flags; - int count = 0; - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); - - if (conn->ibc_state <= IBNAL_CONN_ESTABLISHED) { - count = 1; - kibnal_close_conn_locked (conn, why); - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return (count); -} - -void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) -{ - LIST_HEAD (zombies); - kib_tx_t *tx; - unsigned long flags; - - LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; - - if (peer->ibp_connecting != 0) { - /* another connection attempt under way (loopback?)... 
*/ - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - return; - } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; - /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); - - /* Take peer's blocked blocked transmits; I'll complete - * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); - - while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - } -} - -void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) -{ - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int rc; - int i; - - /* passive connection has no connreq & vice versa */ - LASSERT (!active == !(conn->ibc_connreq != NULL)); - if (active) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - if (state == IBNAL_CONN_CONNECTING) { - /* Install common (active/passive) callback for - * disconnect/idle notification if I got as far as getting - * a CM comm_id */ - rc = tsIbCmCallbackModify(conn->ibc_comm_id, - kibnal_conn_callback, 
conn); - LASSERT (rc == 0); - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - LASSERT (peer->ibp_connecting != 0); - - if (status == 0) { - /* connection established... */ - LASSERT (state == IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_ESTABLISHED; - - if (!kibnal_peer_active(peer)) { - /* ...but peer deleted meantime */ - status = -ECONNABORTED; - } - } else { - LASSERT (state == IBNAL_CONN_INIT_QP || - state == IBNAL_CONN_CONNECTING); - } - - if (status == 0) { - /* Everything worked! */ - - peer->ibp_connecting--; - - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; - - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - kibnal_queue_tx_locked (tx, conn); - } - - spin_unlock (&conn->ibc_lock); - - /* Nuke any dangling conns from a different peer instance... 
*/ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - - CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, - conn->ibc_rxs[i].rx_vaddr); - - kibnal_post_rx (&conn->ibc_rxs[i], 0); - } - - kibnal_check_sends (conn); - return; - } - - /* connection failed */ - if (state == IBNAL_CONN_CONNECTING) { - /* schedule for connd to close */ - kibnal_close_conn_locked (conn, status); - } else { - /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = IBNAL_CONN_ZOMBIE; - } - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - kibnal_peer_connect_failed (conn->ibc_peer, active, status); - - if (state != IBNAL_CONN_CONNECTING) { - /* drop caller's ref if we're not waiting for the - * IB_CM_IDLE callback */ - kibnal_put_conn (conn); - } -} - -int -kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, - ptl_nid_t nid, __u64 incarnation, int queue_depth) -{ - kib_conn_t *conn = kibnal_create_conn(); - kib_peer_t *peer; - kib_peer_t *peer2; - unsigned long flags; - - if (conn == NULL) - return (-ENOMEM); - - if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); - return (-EPROTO); - } - - /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (nid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-ENOMEM); - } - - 
write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - - peer2 = kibnal_find_peer_locked(nid); - if (peer2 == NULL) { - /* peer table takes my ref on peer */ - list_add_tail (&peer->ibp_list, - kibnal_nid2peerlist(nid)); - } else { - kibnal_put_peer (peer); - peer = peer2; - } - - /* +1 ref for conn */ - atomic_inc (&peer->ibp_refcount); - peer->ibp_connecting++; - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - - conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - conn->ibc_comm_id = cid; - conn->ibc_incarnation = incarnation; - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - *connp = conn; - return (0); -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_idle_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - /* Shouldn't ever get a callback after TS_IB_CM_IDLE */ - CERROR ("Unexpected event %d: conn %p\n", event, arg); - LBUG (); - return TS_IB_CM_CALLBACK_PROCEED; -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - int done; - int rc; - - /* Established Connection Notifier */ - - switch (event) { - default: - CERROR("Connection %p -> "LPX64" ERROR %d\n", - conn, conn->ibc_peer->ibp_nid, event); - kibnal_close_conn (conn, -ECONNABORTED); - break; - - case TS_IB_CM_DISCONNECTED: - CDEBUG(D_WARNING, "Connection %p -> "LPX64" DISCONNECTED.\n", - conn, conn->ibc_peer->ibp_nid); - kibnal_close_conn (conn, 0); - break; - - case TS_IB_CM_IDLE: - CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n", - conn, conn->ibc_peer->ibp_nid); - kibnal_put_conn (conn); /* Lose CM's ref */ - - /* LASSERT (no further callbacks) */ - rc = tsIbCmCallbackModify(cid, - kibnal_idle_conn_callback, conn); - LASSERT (rc == 0); - - /* NB we wait until the connection has closed before - * completing 
outstanding passive RDMAs so we can be sure - * the network can't touch the mapped memory any more. */ - - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - if (!done) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del(&tx->tx_list); - kibnal_tx_done (tx); - } - break; - } - - return TS_IB_CM_CALLBACK_PROCEED; -} - -tTS_IB_CM_CALLBACK_RETURN -kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - int rc; - - switch (event) { - default: - if (conn == NULL) { - /* no connection yet */ - CERROR ("Unexpected event: %d\n", event); - return TS_IB_CM_CALLBACK_ABORT; - } - - CERROR ("Unexpected event %p -> "LPX64": %d\n", - conn, conn->ibc_peer->ibp_nid, event); - kibnal_connreq_done (conn, 0, -ECONNABORTED); - break; - - case TS_IB_CM_REQ_RECEIVED: { - struct ib_cm_req_received_param *req = param; - kib_wire_connreq_t *wcr = req->remote_private_data; - - LASSERT (conn == NULL); - - CDEBUG(D_NET, "REQ from "LPX64"\n", le64_to_cpu(wcr->wcr_nid)); - - if (req->remote_private_data_len < sizeof (*wcr)) { - CERROR("Connect from remote LID %04x: 
too short %d\n", - req->dlid, req->remote_private_data_len); - return TS_IB_CM_CALLBACK_ABORT; - } - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't accept LID %04x: bad magic %08x\n", - req->dlid, le32_to_cpu(wcr->wcr_magic)); - return TS_IB_CM_CALLBACK_ABORT; - } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't accept LID %04x: bad version %d\n", - req->dlid, le16_to_cpu(wcr->wcr_magic)); - return TS_IB_CM_CALLBACK_ABORT; - } - - rc = kibnal_accept(&conn, - cid, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); - if (rc != 0) { - CERROR ("Can't accept "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), rc); - return TS_IB_CM_CALLBACK_ABORT; - } - - /* update 'arg' for next callback */ - rc = tsIbCmCallbackModify(cid, - kibnal_passive_conn_callback, conn); - LASSERT (rc == 0); - - req->accept_param.qp = conn->ibc_qp; - *((kib_wire_connreq_t *)req->accept_param.reply_private_data) - = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - req->accept_param.reply_private_data_len = sizeof(kib_wire_connreq_t); - req->accept_param.responder_resources = IBNAL_RESPONDER_RESOURCES; - req->accept_param.initiator_depth = IBNAL_RESPONDER_RESOURCES; - req->accept_param.rnr_retry_count = IBNAL_RNR_RETRY; - req->accept_param.flow_control = IBNAL_FLOW_CONTROL; - - CDEBUG(D_NET, "Proceeding\n"); - break; - } - - case TS_IB_CM_ESTABLISHED: - LASSERT (conn != NULL); - CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 0, 0); - break; - } - - /* NB if the connreq is done, we switch to kibnal_conn_callback */ - return TS_IB_CM_CALLBACK_PROCEED; -} - -tTS_IB_CM_CALLBACK_RETURN 
-kibnal_active_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) -{ - kib_conn_t *conn = arg; - - switch (event) { - case TS_IB_CM_REP_RECEIVED: { - struct ib_cm_rep_received_param *rep = param; - kib_wire_connreq_t *wcr = rep->remote_private_data; - - if (rep->remote_private_data_len < sizeof (*wcr)) { - CERROR ("Short reply from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, - rep->remote_private_data_len); - kibnal_connreq_done (conn, 1, -EPROTO); - break; - } - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't connect "LPX64": bad magic %08x\n", - conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - kibnal_connreq_done (conn, 1, -EPROTO); - break; - } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't connect "LPX64": bad version %d\n", - conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - kibnal_connreq_done (conn, 1, -EPROTO); - break; - } - - if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { - CERROR ("Can't connect "LPX64": bad queue depth %d\n", - conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_queue_depth)); - kibnal_connreq_done (conn, 1, -EPROTO); - break; - } - - if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { - CERROR ("Unexpected NID "LPX64" from "LPX64"\n", - le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - kibnal_connreq_done (conn, 1, -EPROTO); - break; - } - - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); - conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - break; - } - - case TS_IB_CM_ESTABLISHED: - CDEBUG(D_WARNING, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 1, 0); - break; - - case TS_IB_CM_IDLE: - CERROR("Connection %p -> "LPX64" IDLE\n", - conn, conn->ibc_peer->ibp_nid); - /* Back out state change: I'm disengaged from CM */ - conn->ibc_state = 
IBNAL_CONN_INIT_QP; - - kibnal_connreq_done (conn, 1, -ECONNABORTED); - break; - - default: - CERROR("Connection %p -> "LPX64" ERROR %d\n", - conn, conn->ibc_peer->ibp_nid, event); - kibnal_connreq_done (conn, 1, -ECONNABORTED); - break; - } - - /* NB if the connreq is done, we switch to kibnal_conn_callback */ - return TS_IB_CM_CALLBACK_PROCEED; -} - -int -kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_path_record *resp, int remaining, - void *arg) -{ - kib_conn_t *conn = arg; - - if (status != 0) { - CERROR ("status %d\n", status); - kibnal_connreq_done (conn, 1, status); - goto out; - } - - conn->ibc_connreq->cr_path = *resp; - - conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { - .qp = conn->ibc_qp, - .req_private_data = &conn->ibc_connreq->cr_wcr, - .req_private_data_len = sizeof(conn->ibc_connreq->cr_wcr), - .responder_resources = IBNAL_RESPONDER_RESOURCES, - .initiator_depth = IBNAL_RESPONDER_RESOURCES, - .retry_count = IBNAL_RETRY, - .rnr_retry_count = IBNAL_RNR_RETRY, - .cm_response_timeout = kibnal_tunables.kib_io_timeout, - .max_cm_retries = IBNAL_CM_RETRY, - .flow_control = IBNAL_FLOW_CONTROL, - }; - - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; - - /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = IBNAL_CONN_CONNECTING; - - CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", - conn->ibc_connreq->cr_service.service_id, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - - /* kibnal_connect_callback gets my conn ref */ - status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, - &conn->ibc_connreq->cr_path, NULL, - conn->ibc_connreq->cr_service.service_id, 0, - kibnal_active_conn_callback, conn, - &conn->ibc_comm_id); - if (status != 0) { - CERROR ("Connect: %d\n", status); - /* Back out state change: I've not got a CM comm_id yet... */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done (conn, 1, status); - } - - out: - /* return non-zero to prevent further callbacks */ - return 1; -} - -void -kibnal_service_get_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, - struct ib_common_attrib_service *resp, void *arg) -{ - kib_conn_t *conn = arg; - - if (status != 0) { - CERROR ("status %d\n", status); - kibnal_connreq_done (conn, 1, status); - return; - } - - CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", - status, resp->service_id, - *kibnal_service_nid_field(resp)); - - conn->ibc_connreq->cr_service = *resp; - - status = ib_cached_gid_get(kibnal_data.kib_device, - kibnal_data.kib_port, 0, - conn->ibc_connreq->cr_gid); - LASSERT (status == 0); - - /* kibnal_pathreq_callback gets my conn ref */ - status = tsIbPathRecordRequest (kibnal_data.kib_device, - kibnal_data.kib_port, - conn->ibc_connreq->cr_gid, - conn->ibc_connreq->cr_service.service_gid, - conn->ibc_connreq->cr_service.service_pkey, - 0, - kibnal_tunables.kib_io_timeout * HZ, - 0, - kibnal_pathreq_callback, conn, - &conn->ibc_connreq->cr_tid); - - if (status == 0) - return; - - CERROR ("Path record request: %d\n", status); - kibnal_connreq_done (conn, 1, status); -} - -void -kibnal_connect_peer (kib_peer_t *peer) -{ - kib_conn_t *conn = kibnal_create_conn(); - int rc; - - LASSERT (peer->ibp_connecting != 0); - - if (conn == 
NULL) { - CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); - return; - } - - conn->ibc_peer = peer; - atomic_inc (&peer->ibp_refcount); - - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done (conn, 1, -ENOMEM); - return; - } - - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); - - kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); - - /* kibnal_service_get_callback gets my conn ref */ - rc = ib_service_get (kibnal_data.kib_device, - kibnal_data.kib_port, - &conn->ibc_connreq->cr_service, - KIBNAL_SERVICE_KEY_MASK, - kibnal_tunables.kib_io_timeout * HZ, - kibnal_service_get_callback, conn, - &conn->ibc_connreq->cr_tid); - - if (rc == 0) - return; - - CERROR ("ib_service_get: %d\n", rc); - kibnal_connreq_done (conn, 1, rc); -} - -int -kibnal_conn_timed_out (kib_conn_t *conn) -{ - kib_tx_t *tx; - struct list_head *ttmp; - unsigned long flags; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_tx_queue) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; - } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; - } - } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - return 0; -} - -void -kibnal_check_conns (int idx) -{ - struct list_head *peers = &kibnal_data.kib_peers[idx]; - struct list_head *ptmp; - kib_peer_t *peer; - kib_conn_t *conn; - struct list_head *ctmp; - - again: - /* NB. 
We expect to have a look at all the peers and not find any - * rdmas to time out, so we just use a shared lock while we - * take a look... */ - read_lock (&kibnal_data.kib_global_lock); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, kib_peer_t, ibp_list); - - list_for_each (ctmp, &peer->ibp_conns) { - conn = list_entry (ctmp, kib_conn_t, ibc_list); - - LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); - - - /* In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... */ - kibnal_check_sends(conn); - - if (!kibnal_conn_timed_out(conn)) - continue; - - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - atomic_inc (&conn->ibc_refcount); - read_unlock (&kibnal_data.kib_global_lock); - - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); - - kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); - - /* start again now I've dropped the lock */ - goto again; - } - } - - read_unlock (&kibnal_data.kib_global_lock); -} - -void -kibnal_terminate_conn (kib_conn_t *conn) -{ - int rc; - - CDEBUG(D_NET, "conn %p\n", conn); - LASSERT (conn->ibc_state == IBNAL_CONN_DEATHROW); - conn->ibc_state = IBNAL_CONN_ZOMBIE; - - rc = ib_cm_disconnect (conn->ibc_comm_id); - if (rc != 0) - CERROR ("Error %d disconnecting conn %p -> "LPX64"\n", - rc, conn, conn->ibc_peer->ibp_nid); -} - -int -kibnal_connd (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kib_conn_t *conn; - kib_peer_t *peer; - int timeout; - int i; - int peer_index = 0; - unsigned long deadline = jiffies; - - kportal_daemonize ("kibnal_connd"); - kportal_blockallsigs (); - - init_waitqueue_entry (&wait, current); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - for (;;) { - if (!list_empty (&kibnal_data.kib_connd_conns)) { - conn = list_entry (kibnal_data.kib_connd_conns.next, - kib_conn_t, ibc_list); - list_del 
(&conn->ibc_list); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - switch (conn->ibc_state) { - case IBNAL_CONN_DEATHROW: - LASSERT (conn->ibc_comm_id != TS_IB_CM_COMM_ID_INVALID); - /* Disconnect: conn becomes a zombie in the - * callback and last ref reschedules it - * here... */ - kibnal_terminate_conn(conn); - kibnal_put_conn (conn); - break; - - case IBNAL_CONN_ZOMBIE: - kibnal_destroy_conn (conn); - break; - - default: - CERROR ("Bad conn %p state: %d\n", - conn, conn->ibc_state); - LBUG(); - } - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - continue; - } - - if (!list_empty (&kibnal_data.kib_connd_peers)) { - peer = list_entry (kibnal_data.kib_connd_peers.next, - kib_peer_t, ibp_connd_list); - - list_del_init (&peer->ibp_connd_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_connect_peer (peer); - kibnal_put_peer (peer); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - /* shut down and nobody left to reap... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - /* careful with the jiffy wrap... */ - while ((timeout = (int)(deadline - jiffies)) <= 0) { - const int n = 4; - const int p = 1; - int chunk = kibnal_data.kib_peer_hash_size; - - /* Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. 
*/ - - if (kibnal_tunables.kib_io_timeout > n * p) - chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); - peer_index = (peer_index + 1) % - kibnal_data.kib_peer_hash_size; - } - - deadline += p * HZ; - } - - kibnal_data.kib_connd_waketime = jiffies + timeout; - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - if (!kibnal_data.kib_shutdown && - list_empty (&kibnal_data.kib_connd_conns) && - list_empty (&kibnal_data.kib_connd_peers)) - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - } - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - - kibnal_thread_fini (); - return (0); -} - -int -kibnal_scheduler(void *arg) -{ - long id = (long)arg; - char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; - - snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - for (;;) { - did_something = 0; - - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - kibnal_tx_done(tx); - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - - kibnal_rx(rx); - - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - - /* shut down and no receives to complete... 
*/ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, - flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - (kibnal_data.kib_shutdown && - atomic_read (&kibnal_data.kib_nconns) == 0)); - } else { - our_cond_resched(); - } - - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } - } - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - - kibnal_thread_fini(); - return (0); -} - - -lib_nal_t kibnal_lib = { - libnal_data: &kibnal_data, /* NAL private data */ - libnal_send: kibnal_send, - libnal_send_pages: kibnal_send_pages, - libnal_recv: kibnal_recv, - libnal_recv_pages: kibnal_recv_pages, - libnal_dist: kibnal_dist -}; diff --git a/lustre/portals/knals/qswnal/.cvsignore b/lustre/portals/knals/qswnal/.cvsignore deleted file mode 100644 index 48b17e9..0000000 --- a/lustre/portals/knals/qswnal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.*.cmd -.tmp_versions -.depend diff --git a/lustre/portals/knals/qswnal/Makefile.in b/lustre/portals/knals/qswnal/Makefile.in deleted file mode 100644 index d27240c..0000000 --- a/lustre/portals/knals/qswnal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kqswnal -kqswnal-objs := qswnal.o qswnal_cb.o - -EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/qswnal/autoMakefile.am b/lustre/portals/knals/qswnal/autoMakefile.am deleted file mode 100644 index b5b2e07..0000000 --- a/lustre/portals/knals/qswnal/autoMakefile.am +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -if BUILD_QSWNAL -modulenet_DATA = kqswnal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kqswnal-objs:%.o=%.c) qswnal.h diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c deleted file mode 100644 index 5aff4e9..0000000 --- a/lustre/portals/knals/qswnal/qswnal.c +++ /dev/null @@ -1,800 +0,0 @@ -/* - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Eric Barton - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * W. Marcus Miller - Based on ksocknal - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include "qswnal.h" - -nal_t kqswnal_api; -kqswnal_data_t kqswnal_data; -ptl_handle_ni_t kqswnal_ni; -kqswnal_tunables_t kqswnal_tunables; - -kpr_nal_interface_t kqswnal_router_interface = { - kprni_nalid: QSWNAL, - kprni_arg: NULL, - kprni_fwd: kqswnal_fwd_packet, - kprni_notify: NULL, /* we're connectionless */ -}; - -#if CONFIG_SYSCTL -#define QSWNAL_SYSCTL 201 - -#define QSWNAL_SYSCTL_OPTIMIZED_GETS 1 -#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2 - -static ctl_table kqswnal_ctl_table[] = { - {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts", - &kqswnal_tunables.kqn_optimized_puts, sizeof (int), - 0644, NULL, &proc_dointvec}, - {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets", - &kqswnal_tunables.kqn_optimized_gets, sizeof (int), - 0644, NULL, &proc_dointvec}, - {0} -}; - -static ctl_table kqswnal_top_ctl_table[] = { - {QSWNAL_SYSCTL, "qswnal", NULL, 0, 0555, kqswnal_ctl_table}, - {0} -}; -#endif - -int -kqswnal_get_tx_desc (struct portals_cfg *pcfg) -{ - unsigned long flags; - struct list_head *tmp; - kqswnal_tx_t *ktx; - ptl_hdr_t *hdr; - int index = pcfg->pcfg_count; - int rc = -ENOENT; - - spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - - list_for_each (tmp, &kqswnal_data.kqn_activetxds) { - if (index-- != 0) - continue; - - ktx = list_entry (tmp, kqswnal_tx_t, ktx_list); - hdr = (ptl_hdr_t *)ktx->ktx_buffer; - - pcfg->pcfg_pbuf1 = (char *)ktx; - pcfg->pcfg_count = le32_to_cpu(hdr->type); - pcfg->pcfg_size = le32_to_cpu(hdr->payload_length); - pcfg->pcfg_nid = le64_to_cpu(hdr->dest_nid); - pcfg->pcfg_nid2 = ktx->ktx_nid; - pcfg->pcfg_misc = ktx->ktx_launcher; - pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) | - (!ktx->ktx_isnblk ? 
0 : 2) | - (ktx->ktx_state << 2); - rc = 0; - break; - } - - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - return (rc); -} - -int -kqswnal_cmd (struct portals_cfg *pcfg, void *private) -{ - LASSERT (pcfg != NULL); - - switch (pcfg->pcfg_command) { - case NAL_CMD_GET_TXDESC: - return (kqswnal_get_tx_desc (pcfg)); - - case NAL_CMD_REGISTER_MYNID: - CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n", - pcfg->pcfg_nid - kqswnal_data.kqn_elanid, - kqswnal_data.kqn_nid_offset); - kqswnal_data.kqn_nid_offset = - pcfg->pcfg_nid - kqswnal_data.kqn_elanid; - kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; - return (0); - - default: - return (-EINVAL); - } -} - -static void -kqswnal_shutdown(nal_t *nal) -{ - unsigned long flags; - kqswnal_tx_t *ktx; - kqswnal_rx_t *krx; - int do_lib_fini = 0; - - /* NB The first ref was this module! */ - if (nal->nal_refct != 0) { - PORTAL_MODULE_UNUSE; - return; - } - - CDEBUG (D_NET, "shutdown\n"); - LASSERT (nal == &kqswnal_api); - - switch (kqswnal_data.kqn_init) - { - default: - LASSERT (0); - - case KQN_INIT_ALL: - libcfs_nal_cmd_unregister(QSWNAL); - /* fall through */ - - case KQN_INIT_LIB: - do_lib_fini = 1; - /* fall through */ - - case KQN_INIT_DATA: - break; - - case KQN_INIT_NOTHING: - return; - } - - /**********************************************************************/ - /* Tell router we're shutting down. Any router calls my threads - * make will now fail immediately and the router will stop calling - * into me. */ - kpr_shutdown (&kqswnal_data.kqn_router); - - /**********************************************************************/ - /* Signal the start of shutdown... 
*/ - spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); - kqswnal_data.kqn_shuttingdown = 1; - spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - - wake_up_all(&kqswnal_data.kqn_idletxd_waitq); - - /**********************************************************************/ - /* wait for sends that have allocated a tx desc to launch or give up */ - while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) { - CDEBUG(D_NET, "waiting for %d pending sends\n", - atomic_read (&kqswnal_data.kqn_pending_txs)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - /**********************************************************************/ - /* close elan comms */ -#if MULTIRAIL_EKC - /* Shut down receivers first; rx callbacks might try sending... */ - if (kqswnal_data.kqn_eprx_small != NULL) - ep_free_rcvr (kqswnal_data.kqn_eprx_small); - - if (kqswnal_data.kqn_eprx_large != NULL) - ep_free_rcvr (kqswnal_data.kqn_eprx_large); - - /* NB ep_free_rcvr() returns only after we've freed off all receive - * buffers (see shutdown handling in kqswnal_requeue_rx()). This - * means we must have completed any messages we passed to - * lib_parse() or kpr_fwd_start(). */ - - if (kqswnal_data.kqn_eptx != NULL) - ep_free_xmtr (kqswnal_data.kqn_eptx); - - /* NB ep_free_xmtr() returns only after all outstanding transmits - * have called their callback... 
*/ - LASSERT(list_empty(&kqswnal_data.kqn_activetxds)); -#else - /* "Old" EKC just pretends to shutdown cleanly but actually - * provides no guarantees */ - if (kqswnal_data.kqn_eprx_small != NULL) - ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small); - - if (kqswnal_data.kqn_eprx_large != NULL) - ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large); - - /* wait for transmits to complete */ - while (!list_empty(&kqswnal_data.kqn_activetxds)) { - CWARN("waiting for active transmits to complete\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - if (kqswnal_data.kqn_eptx != NULL) - ep_free_large_xmtr (kqswnal_data.kqn_eptx); -#endif - /**********************************************************************/ - /* flag threads to terminate, wake them and wait for them to die */ - kqswnal_data.kqn_shuttingdown = 2; - wake_up_all (&kqswnal_data.kqn_sched_waitq); - - while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) { - CDEBUG(D_NET, "waiting for %d threads to terminate\n", - atomic_read (&kqswnal_data.kqn_nthreads)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - /**********************************************************************/ - /* No more threads. No more portals, router or comms callbacks! - * I control the horizontals and the verticals... 
- */ - -#if MULTIRAIL_EKC - LASSERT (list_empty (&kqswnal_data.kqn_readyrxds)); - LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds)); - LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds)); -#endif - - /**********************************************************************/ - /* Complete any blocked forwarding packets, with error - */ - - while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) - { - kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, - kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN); - } - - /**********************************************************************/ - /* finalise router and portals lib */ - - kpr_deregister (&kqswnal_data.kqn_router); - - if (do_lib_fini) - lib_fini (&kqswnal_lib); - - /**********************************************************************/ - /* Unmap message buffers and free all descriptors and buffers - */ - -#if MULTIRAIL_EKC - /* FTTB, we need to unmap any remaining mapped memory. When - * ep_dvma_release() get fixed (and releases any mappings in the - * region), we can delete all the code from here --------> */ - - for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) { - /* If ktx has a buffer, it got mapped; unmap now. NB only - * the pre-mapped stuff is still mapped since all tx descs - * must be idle */ - - if (ktx->ktx_buffer != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_ebuffer); - } - - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { - /* If krx_kiov[0].kiov_page got allocated, it got mapped. 
- * NB subsequent pages get merged */ - - if (krx->krx_kiov[0].kiov_page != NULL) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_rx_nmh, - &krx->krx_elanbuffer); - } - /* <----------- to here */ - - if (kqswnal_data.kqn_ep_rx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh); - - if (kqswnal_data.kqn_ep_tx_nmh != NULL) - ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh); -#else - if (kqswnal_data.kqn_eprxdmahandle != NULL) - { - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle, 0, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE); - - elan3_dma_release(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle); - } - - if (kqswnal_data.kqn_eptxdmahandle != NULL) - { - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, 0, - KQSW_NTXMSGPAGES * (KQSW_NTXMSGS + - KQSW_NNBLK_TXMSGS)); - - elan3_dma_release(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle); - } -#endif - - while (kqswnal_data.kqn_txds != NULL) { - ktx = kqswnal_data.kqn_txds; - - if (ktx->ktx_buffer != NULL) - PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - - kqswnal_data.kqn_txds = ktx->ktx_alloclist; - PORTAL_FREE(ktx, sizeof(*ktx)); - } - - while (kqswnal_data.kqn_rxds != NULL) { - int i; - - krx = kqswnal_data.kqn_rxds; - for (i = 0; i < krx->krx_npages; i++) - if (krx->krx_kiov[i].kiov_page != NULL) - __free_page (krx->krx_kiov[i].kiov_page); - - kqswnal_data.kqn_rxds = krx->krx_alloclist; - PORTAL_FREE(krx, sizeof (*krx)); - } - - /* resets flags, pointers to NULL etc */ - memset(&kqswnal_data, 0, sizeof (kqswnal_data)); - - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); - - printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); -} - -static int -kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - 
ptl_ni_limits_t *actual_limits) -{ -#if MULTIRAIL_EKC - EP_RAILMASK all_rails = EP_RAILMASK_ALL; -#else - ELAN3_DMA_REQUEST dmareq; -#endif - int rc; - int i; - kqswnal_rx_t *krx; - kqswnal_tx_t *ktx; - int elan_page_idx; - ptl_process_id_t my_process_id; - int pkmem = atomic_read(&portal_kmemory); - - LASSERT (nal == &kqswnal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } - - LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - - /* ensure all pointers NULL etc */ - memset (&kqswnal_data, 0, sizeof (kqswnal_data)); - - INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); - INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); - INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); - spin_lock_init (&kqswnal_data.kqn_idletxd_lock); - init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); - INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); - - INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds); - INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds); - INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); - - spin_lock_init (&kqswnal_data.kqn_sched_lock); - init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - - /* Leave kqn_rpc_success zeroed */ -#if MULTIRAIL_EKC - kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; -#else - kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; -#endif - - /* pointers/lists/locks initialised */ - kqswnal_data.kqn_init = KQN_INIT_DATA; - -#if MULTIRAIL_EKC - kqswnal_data.kqn_ep = ep_system(); - if (kqswnal_data.kqn_ep == NULL) { - CERROR("Can't initialise EKC\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); - } - - if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { - CERROR("Can't get elan ID\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); - } -#else - 
/**********************************************************************/ - /* Find the first Elan device */ - - kqswnal_data.kqn_ep = ep_device (0); - if (kqswnal_data.kqn_ep == NULL) - { - CERROR ("Can't get elan device 0\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); - } -#endif - - kqswnal_data.kqn_nid_offset = 0; - kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep); - kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep); - - /**********************************************************************/ - /* Get the transmitter */ - - kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep); - if (kqswnal_data.kqn_eptx == NULL) - { - CERROR ("Can't allocate transmitter\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - /**********************************************************************/ - /* Get the receivers */ - - kqswnal_data.kqn_eprx_small = ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_SMALL, - KQSW_EP_ENVELOPES_SMALL); - if (kqswnal_data.kqn_eprx_small == NULL) - { - CERROR ("Can't install small msg receiver\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_LARGE, - KQSW_EP_ENVELOPES_LARGE); - if (kqswnal_data.kqn_eprx_large == NULL) - { - CERROR ("Can't install large msg receiver\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - /**********************************************************************/ - /* Reserve Elan address space for transmit descriptors NB we may - * either send the contents of associated buffers immediately, or - * map them for the peer to suck/blow... 
*/ -#if MULTIRAIL_EKC - kqswnal_data.kqn_ep_tx_nmh = - ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), - EP_PERM_WRITE); - if (kqswnal_data.kqn_ep_tx_nmh == NULL) { - CERROR("Can't reserve tx dma space\n"); - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); - } -#else - dmareq.Waitfn = DDI_DMA_SLEEP; - dmareq.ElanAddr = (E3_Addr) 0; - dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; - dmareq.Perm = ELAN_PERM_REMOTEWRITE; - - rc = elan3_dma_reserve(kqswnal_data.kqn_ep->DmaState, - KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), - &dmareq, &kqswnal_data.kqn_eptxdmahandle); - if (rc != DDI_SUCCESS) - { - CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } -#endif - /**********************************************************************/ - /* Reserve Elan address space for receive buffers */ -#if MULTIRAIL_EKC - kqswnal_data.kqn_ep_rx_nmh = - ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, - EP_PERM_WRITE); - if (kqswnal_data.kqn_ep_tx_nmh == NULL) { - CERROR("Can't reserve rx dma space\n"); - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); - } -#else - dmareq.Waitfn = DDI_DMA_SLEEP; - dmareq.ElanAddr = (E3_Addr) 0; - dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; - dmareq.Perm = ELAN_PERM_REMOTEWRITE; - - rc = elan3_dma_reserve (kqswnal_data.kqn_ep->DmaState, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, - &dmareq, &kqswnal_data.kqn_eprxdmahandle); - if (rc != DDI_SUCCESS) - { - CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } -#endif - /**********************************************************************/ - /* Allocate/Initialise transmit descriptors */ - - kqswnal_data.kqn_txds = NULL; - for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) - { - int premapped_pages; - int basepage = i * KQSW_NTXMSGPAGES; - - 
PORTAL_ALLOC (ktx, sizeof(*ktx)); - if (ktx == NULL) { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ - ktx->ktx_alloclist = kqswnal_data.kqn_txds; - kqswnal_data.kqn_txds = ktx; - - PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); - if (ktx->ktx_buffer == NULL) - { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - /* Map pre-allocated buffer NOW, to save latency on transmit */ - premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer, - KQSW_TX_BUFFER_SIZE); -#if MULTIRAIL_EKC - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &all_rails, &ktx->ktx_ebuffer); -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, - basepage, &ktx->ktx_ebuffer); -#endif - ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ - ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - - INIT_LIST_HEAD (&ktx->ktx_delayed_list); - - ktx->ktx_state = KTX_IDLE; -#if MULTIRAIL_EKC - ktx->ktx_rail = -1; /* unset rail */ -#endif - ktx->ktx_isnblk = (i >= KQSW_NTXMSGS); - list_add_tail (&ktx->ktx_list, - ktx->ktx_isnblk ? 
&kqswnal_data.kqn_nblk_idletxds : - &kqswnal_data.kqn_idletxds); - } - - /**********************************************************************/ - /* Allocate/Initialise receive descriptors */ - kqswnal_data.kqn_rxds = NULL; - elan_page_idx = 0; - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) - { -#if MULTIRAIL_EKC - EP_NMD elanbuffer; -#else - E3_Addr elanbuffer; -#endif - int j; - - PORTAL_ALLOC(krx, sizeof(*krx)); - if (krx == NULL) { - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); - } - - memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ - krx->krx_alloclist = kqswnal_data.kqn_rxds; - kqswnal_data.kqn_rxds = krx; - - if (i < KQSW_NRXMSGS_SMALL) - { - krx->krx_npages = KQSW_NRXMSGPAGES_SMALL; - krx->krx_eprx = kqswnal_data.kqn_eprx_small; - } - else - { - krx->krx_npages = KQSW_NRXMSGPAGES_LARGE; - krx->krx_eprx = kqswnal_data.kqn_eprx_large; - } - - LASSERT (krx->krx_npages > 0); - for (j = 0; j < krx->krx_npages; j++) - { - struct page *page = alloc_page(GFP_KERNEL); - - if (page == NULL) { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); - } - - krx->krx_kiov[j].kiov_page = page; - LASSERT(page_address(page) != NULL); - -#if MULTIRAIL_EKC - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - page_address(page), - PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh, - elan_page_idx, &all_rails, &elanbuffer); - - if (j == 0) { - krx->krx_elanbuffer = elanbuffer; - } else { - rc = ep_nmd_merge(&krx->krx_elanbuffer, - &krx->krx_elanbuffer, - &elanbuffer); - /* NB contiguous mapping */ - LASSERT(rc); - } -#else - elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle, - page_address(page), - PAGE_SIZE, elan_page_idx, - &elanbuffer); - if (j == 0) - krx->krx_elanbuffer = elanbuffer; - - /* NB contiguous mapping */ - LASSERT (elanbuffer == krx->krx_elanbuffer + j * PAGE_SIZE); -#endif - elan_page_idx++; - - } - } - LASSERT (elan_page_idx == - (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) + - (KQSW_NRXMSGS_LARGE * 
KQSW_NRXMSGPAGES_LARGE)); - - /**********************************************************************/ - /* Network interface ready to initialise */ - - my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid); - my_process_id.pid = requested_pid; - - rc = lib_init(&kqswnal_lib, nal, my_process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) - { - CERROR ("lib_init failed %d\n", rc); - kqswnal_shutdown (nal); - return (rc); - } - - kqswnal_data.kqn_init = KQN_INIT_LIB; - - /**********************************************************************/ - /* Queue receives, now that it's OK to run their completion callbacks */ - - for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { - /* NB this enqueue can allocate/sleep (attr == 0) */ - krx->krx_state = KRX_POSTED; -#if MULTIRAIL_EKC - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); -#else - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); -#endif - if (rc != EP_SUCCESS) - { - CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); - } - } - - /**********************************************************************/ - /* Spawn scheduling threads */ - for (i = 0; i < num_online_cpus(); i++) { - rc = kqswnal_thread_start (kqswnal_scheduler, NULL); - if (rc != 0) - { - CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); - } - } - - /**********************************************************************/ - /* Connect to the router */ - rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface); - CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc); - - rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); - } - - 
kqswnal_data.kqn_init = KQN_INIT_ALL; - - printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d " - "(Routing %s, initial mem %d)\n", - kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes, - kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled", - pkmem); - - return (PTL_OK); -} - -void __exit -kqswnal_finalise (void) -{ -#if CONFIG_SYSCTL - if (kqswnal_tunables.kqn_sysctl != NULL) - unregister_sysctl_table (kqswnal_tunables.kqn_sysctl); -#endif - PtlNIFini(kqswnal_ni); - - ptl_unregister_nal(QSWNAL); -} - -static int __init -kqswnal_initialise (void) -{ - int rc; - - kqswnal_api.nal_ni_init = kqswnal_startup; - kqswnal_api.nal_ni_fini = kqswnal_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS; - kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; - - rc = ptl_register_nal(QSWNAL, &kqswnal_api); - if (rc != PTL_OK) { - CERROR("Can't register QSWNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways, and the workaround for 'EKC blocks forever until - * the service is active' want the NAL started up at module load - * time... */ - rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(QSWNAL); - return (-ENODEV); - } - -#if CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kqswnal_tunables.kqn_sysctl = - register_sysctl_table (kqswnal_top_ctl_table, 0); -#endif - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01"); -MODULE_LICENSE("GPL"); - -module_init (kqswnal_initialise); -module_exit (kqswnal_finalise); diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h deleted file mode 100644 index 6e04752..0000000 --- a/lustre/portals/knals/qswnal/qswnal.h +++ /dev/null @@ -1,376 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Basic library routines. 
- * - */ - -#ifndef _QSWNAL_H -#define _QSWNAL_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#undef printf /* nasty QSW #define */ - -#include -#include - -#if MULTIRAIL_EKC -# include -#else -# include -# include -# include -# include -# include -# include -# include -# include -# include -#endif - -#include -#include -#include -#include -#include -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#include /* wait_on_buffer */ -#else -#include /* wait_on_buffer */ -#endif -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include -#include - -#define KQSW_CHECKSUM 0 -#if KQSW_CHECKSUM -typedef unsigned long kqsw_csum_t; -#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) -#else -#define KQSW_CSUM_SIZE 0 -#endif -#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) - -/* - * Performance Tuning defines - * NB no mention of PAGE_SIZE for interoperability - */ -#define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ - -#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ - -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS (PAGE_SIZE == 4096 ? 
512 : 256) /* # reserved transmit messages if can't block */ /* avoid qsnet crash b=5291 */ - -#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ - -#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ -#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ - -#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ - -#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ -#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ -#define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? */ - -/* - * derived constants - */ - -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) -/* The pre-allocated tx buffer (hdr + small payload) */ - -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) -/* Reserve elan address space for pre-allocated and pre-mapped transmit - * buffer and a full payload too. Extra pages allow for page alignment */ - -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) -/* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) - -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) -/* receive hdr/payload always contiguous and page aligned */ -#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) -/* biggest complete packet we can receive (or transmit) */ - -/* Remote memory descriptor */ -typedef struct -{ - __u32 kqrmd_nfrag; /* # frags */ -#if MULTIRAIL_EKC - EP_NMD kqrmd_frag[0]; /* actual frags */ -#else - EP_IOVEC kqrmd_frag[0]; /* actual frags */ -#endif -} kqswnal_remotemd_t; - -typedef struct kqswnal_rx -{ - struct list_head krx_list; /* enqueue -> thread */ - struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */ - EP_RCVR *krx_eprx; /* port to post receives to */ - EP_RXD *krx_rxd; /* receive descriptor (for repost) 
*/ -#if MULTIRAIL_EKC - EP_NMD krx_elanbuffer; /* contiguous Elan buffer */ -#else - E3_Addr krx_elanbuffer; /* contiguous Elan buffer */ -#endif - int krx_npages; /* # pages in receive buffer */ - int krx_nob; /* Number Of Bytes received into buffer */ - int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_status; /* what status to send */ - int krx_state; /* what this RX is doing */ - atomic_t krx_refcount; /* how to tell when rpc is done */ - kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ - ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ -} kqswnal_rx_t; - -#define KRX_POSTED 1 /* receiving */ -#define KRX_PARSE 2 /* ready to be parsed */ -#define KRX_COMPLETING 3 /* waiting to be completed */ - - -typedef struct kqswnal_tx -{ - struct list_head ktx_list; /* enqueue idle/active */ - struct list_head ktx_delayed_list; /* enqueue delayedtxds */ - struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ - unsigned int ktx_isnblk:1; /* reserved descriptor? */ - unsigned int ktx_state:7; /* What I'm doing */ - unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 
0 : 1 */ - uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ - int ktx_npages; /* pages reserved for mapping messages */ - int ktx_nmappedpages; /* # pages mapped for current message */ - int ktx_port; /* destination ep port */ - ptl_nid_t ktx_nid; /* destination node */ - void *ktx_args[3]; /* completion passthru */ - char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ - unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ - - /* debug/info fields */ - pid_t ktx_launcher; /* pid of launching process */ - - int ktx_nfrag; /* # message frags */ -#if MULTIRAIL_EKC - int ktx_rail; /* preferred rail */ - EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */ - EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */ -#else - E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ - EP_IOVEC ktx_frags[EP_MAXFRAG];/* msg frags (elan vaddrs) */ -#endif -} kqswnal_tx_t; - -#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_FORWARDING 1 /* sending a forwarded packet */ -#define KTX_SENDING 2 /* normal send */ -#define KTX_GETTING 3 /* sending optimised get */ -#define KTX_PUTTING 4 /* sending optimised put */ -#define KTX_RDMAING 5 /* handling optimised put/get */ - -typedef struct -{ - /* dynamic tunables... */ - int kqn_optimized_puts; /* optimized PUTs? */ - int kqn_optimized_gets; /* optimized GETs? 
*/ -#if CONFIG_SYSCTL - struct ctl_table_header *kqn_sysctl; /* sysctl interface */ -#endif -} kqswnal_tunables_t; - -typedef struct -{ - char kqn_init; /* what's been initialised */ - char kqn_shuttingdown; /* I'm trying to shut down */ - atomic_t kqn_nthreads; /* # threads running */ - - kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ - kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ - - struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ - struct list_head kqn_activetxds; /* transmit descriptors being used */ - spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ - wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ - struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ - atomic_t kqn_pending_txs; /* # transmits being prepped */ - - spinlock_t kqn_sched_lock; /* serialise packet schedulers */ - wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ - - struct list_head kqn_readyrxds; /* rxds full of data */ - struct list_head kqn_delayedfwds; /* delayed forwards */ - struct list_head kqn_delayedtxds; /* delayed transmits */ - -#if MULTIRAIL_EKC - EP_SYS *kqn_ep; /* elan system */ - EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ - EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */ -#else - EP_DEV *kqn_ep; /* elan device */ - ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ - ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ -#endif - EP_XMTR *kqn_eptx; /* elan transmitter */ - EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ - EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ - - ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ - int kqn_nnodes; /* this cluster's size */ - int kqn_elanid; /* this 
nodes's elan ID */ - - EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ - EP_STATUSBLK kqn_rpc_failed; -} kqswnal_data_t; - -/* kqn_init state */ -#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KQN_INIT_DATA 1 -#define KQN_INIT_LIB 2 -#define KQN_INIT_ALL 3 - -extern lib_nal_t kqswnal_lib; -extern nal_t kqswnal_api; -extern kqswnal_tunables_t kqswnal_tunables; -extern kqswnal_data_t kqswnal_data; - -extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); -extern void kqswnal_rxhandler(EP_RXD *rxd); -extern int kqswnal_scheduler (void *); -extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kqswnal_rx_done (kqswnal_rx_t *krx); - -static inline ptl_nid_t -kqswnal_elanid2nid (int elanid) -{ - return (kqswnal_data.kqn_nid_offset + elanid); -} - -static inline int -kqswnal_nid2elanid (ptl_nid_t nid) -{ - /* not in this cluster? */ - if (nid < kqswnal_data.kqn_nid_offset || - nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes) - return (-1); - - return (nid - kqswnal_data.kqn_nid_offset); -} - -static inline ptl_nid_t -kqswnal_rx_nid(kqswnal_rx_t *krx) -{ - return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); -} - -static inline int -kqswnal_pages_spanned (void *base, int nob) -{ - unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT; - unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT; - - LASSERT (last_page >= first_page); /* can't wrap address space */ - return (last_page - first_page + 1); -} - -#if KQSW_CHECKSUM -static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) -{ - unsigned char *ptr = (unsigned char *)base; - - while (nob-- > 0) - sum += *ptr++; - - return (sum); -} -#endif - -static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) -{ - LASSERT (atomic_read (&krx->krx_refcount) > 0); - if (atomic_dec_and_test (&krx->krx_refcount)) - kqswnal_rx_done(krx); -} - -#if MULTIRAIL_EKC -# ifndef 
EP_RAILMASK_ALL -# error "old (unsupported) version of EKC headers" -# endif -#else -/* multirail defines these in */ -#define EP_MSG_SVC_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ -#define EP_MSG_SVC_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ -/* NB small/large message sizes are GLOBAL constants */ - -/* A minimal attempt to minimise inline #ifdeffing */ - -#define EP_SUCCESS ESUCCESS -#define EP_ENOMEM ENOMEM - -static inline EP_XMTR * -ep_alloc_xmtr(EP_DEV *e) -{ - return (ep_alloc_large_xmtr(e)); -} - -static inline EP_RCVR * -ep_alloc_rcvr(EP_DEV *e, int svc, int nenv) -{ - return (ep_install_large_rcvr(e, svc, nenv)); -} - -static inline void -ep_free_xmtr(EP_XMTR *x) -{ - ep_free_large_xmtr(x); -} - -static inline void -ep_free_rcvr(EP_RCVR *r) -{ - ep_remove_large_rcvr(r); -} -#endif - -#endif /* _QSWNAL_H */ diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c deleted file mode 100644 index 7aee376..0000000 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ /dev/null @@ -1,2008 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Eric Barton - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * W. Marcus Miller - Based on ksocknal - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "qswnal.h" - -/* - * LIB functions follow - * - */ -static int -kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - if (nid == nal->libnal_ni.ni_pid.nid) - *dist = 0; /* it's me */ - else if (kqswnal_nid2elanid (nid) >= 0) - *dist = 1; /* it's my peer */ - else - *dist = 2; /* via router */ - return (0); -} - -void -kqswnal_notify_peer_down(kqswnal_tx_t *ktx) -{ - struct timeval now; - time_t then; - - do_gettimeofday (&now); - then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; - - kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then); -} - -void -kqswnal_unmap_tx (kqswnal_tx_t *ktx) -{ -#if MULTIRAIL_EKC - int i; - - ktx->ktx_rail = -1; /* unset rail */ -#endif - - if (ktx->ktx_nmappedpages == 0) - return; - -#if MULTIRAIL_EKC - CDEBUG(D_NET, "%p unloading %d frags starting at %d\n", - ktx, ktx->ktx_nfrag, ktx->ktx_firsttmpfrag); - - for (i = ktx->ktx_firsttmpfrag; i < ktx->ktx_nfrag; i++) - ep_dvma_unload(kqswnal_data.kqn_ep, - kqswnal_data.kqn_ep_tx_nmh, - &ktx->ktx_frags[i]); -#else - CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n", - ktx, ktx->ktx_nfrag, ktx->ktx_basepage, ktx->ktx_nmappedpages); - - LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages); - LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <= - kqswnal_data.kqn_eptxdmahandle->NumDvmaPages); - - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ktx->ktx_basepage, ktx->ktx_nmappedpages); -#endif - ktx->ktx_nmappedpages = 0; -} - -int -kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov) -{ - int nfrags = ktx->ktx_nfrag; - int nmapped = ktx->ktx_nmappedpages; - int maxmapped = ktx->ktx_npages; - uint32_t basepage = ktx->ktx_basepage + nmapped; - char *ptr; -#if MULTIRAIL_EKC - 
EP_RAILMASK railmask; - int rail; - - if (ktx->ktx_rail < 0) - ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); - rail = ktx->ktx_rail; - if (rail < 0) { - CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); - return (-ENETDOWN); - } - railmask = 1 << rail; -#endif - LASSERT (nmapped <= maxmapped); - LASSERT (nfrags >= ktx->ktx_firsttmpfrag); - LASSERT (nfrags <= EP_MAXFRAG); - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before 'offset' */ - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = kiov->kiov_len - offset; - - /* each page frag is contained in one page */ - LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); - - if (fraglen > nob) - fraglen = nob; - - nmapped++; - if (nmapped > maxmapped) { - CERROR("Can't map message in %d pages (max %d)\n", - nmapped, maxmapped); - return (-EMSGSIZE); - } - - if (nfrags == EP_MAXFRAG) { - CERROR("Message too fragmented in Elan VM (max %d frags)\n", - EP_MAXFRAG); - return (-EMSGSIZE); - } - - /* XXX this is really crap, but we'll have to kmap until - * EKC has a page (rather than vaddr) mapping interface */ - - ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; - - CDEBUG(D_NET, - "%p[%d] loading %p for %d, page %d, %d total\n", - ktx, nfrags, ptr, fraglen, basepage, nmapped); - -#if MULTIRAIL_EKC - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - ptr, fraglen, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &railmask, &ktx->ktx_frags[nfrags]); - - if (nfrags == ktx->ktx_firsttmpfrag || - !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags])) { - /* new frag if this is the first or can't merge */ - nfrags++; - } -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ptr, fraglen, - basepage, &ktx->ktx_frags[nfrags].Base); - - if (nfrags > 0 && /* 
previous frag mapped */ - ktx->ktx_frags[nfrags].Base == /* contiguous with this one */ - (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len)) - /* just extend previous */ - ktx->ktx_frags[nfrags - 1].Len += fraglen; - else { - ktx->ktx_frags[nfrags].Len = fraglen; - nfrags++; /* new frag */ - } -#endif - - kunmap (kiov->kiov_page); - - /* keep in loop for failure case */ - ktx->ktx_nmappedpages = nmapped; - - basepage++; - kiov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - ktx->ktx_nfrag = nfrags; - CDEBUG (D_NET, "%p got %d frags over %d pages\n", - ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages); - - return (0); -} - -int -kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, - int niov, struct iovec *iov) -{ - int nfrags = ktx->ktx_nfrag; - int nmapped = ktx->ktx_nmappedpages; - int maxmapped = ktx->ktx_npages; - uint32_t basepage = ktx->ktx_basepage + nmapped; -#if MULTIRAIL_EKC - EP_RAILMASK railmask; - int rail; - - if (ktx->ktx_rail < 0) - ktx->ktx_rail = ep_xmtr_prefrail(kqswnal_data.kqn_eptx, - EP_RAILMASK_ALL, - kqswnal_nid2elanid(ktx->ktx_nid)); - rail = ktx->ktx_rail; - if (rail < 0) { - CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); - return (-ENETDOWN); - } - railmask = 1 << rail; -#endif - LASSERT (nmapped <= maxmapped); - LASSERT (nfrags >= ktx->ktx_firsttmpfrag); - LASSERT (nfrags <= EP_MAXFRAG); - LASSERT (niov > 0); - LASSERT (nob > 0); - - /* skip complete frags before offset */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - int fraglen = iov->iov_len - offset; - long npages; - - if (fraglen > nob) - fraglen = nob; - npages = kqswnal_pages_spanned (iov->iov_base, fraglen); - - nmapped += npages; - if (nmapped > maxmapped) { - CERROR("Can't map message in %d pages (max %d)\n", - nmapped, maxmapped); - return (-EMSGSIZE); - } - - if (nfrags == 
EP_MAXFRAG) { - CERROR("Message too fragmented in Elan VM (max %d frags)\n", - EP_MAXFRAG); - return (-EMSGSIZE); - } - - CDEBUG(D_NET, - "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", - ktx, nfrags, iov->iov_base + offset, fraglen, - basepage, npages, nmapped); - -#if MULTIRAIL_EKC - ep_dvma_load(kqswnal_data.kqn_ep, NULL, - iov->iov_base + offset, fraglen, - kqswnal_data.kqn_ep_tx_nmh, basepage, - &railmask, &ktx->ktx_frags[nfrags]); - - if (nfrags == ktx->ktx_firsttmpfrag || - !ep_nmd_merge(&ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags - 1], - &ktx->ktx_frags[nfrags])) { - /* new frag if this is the first or can't merge */ - nfrags++; - } -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - iov->iov_base + offset, fraglen, - basepage, &ktx->ktx_frags[nfrags].Base); - - if (nfrags > 0 && /* previous frag mapped */ - ktx->ktx_frags[nfrags].Base == /* contiguous with this one */ - (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len)) - /* just extend previous */ - ktx->ktx_frags[nfrags - 1].Len += fraglen; - else { - ktx->ktx_frags[nfrags].Len = fraglen; - nfrags++; /* new frag */ - } -#endif - - /* keep in loop for failure case */ - ktx->ktx_nmappedpages = nmapped; - - basepage += npages; - iov++; - niov--; - nob -= fraglen; - offset = 0; - - /* iov must not run out before end of data */ - LASSERT (nob == 0 || niov > 0); - - } while (nob > 0); - - ktx->ktx_nfrag = nfrags; - CDEBUG (D_NET, "%p got %d frags over %d pages\n", - ktx, ktx->ktx_nfrag, ktx->ktx_nmappedpages); - - return (0); -} - - -void -kqswnal_put_idle_tx (kqswnal_tx_t *ktx) -{ - kpr_fwd_desc_t *fwd = NULL; - unsigned long flags; - - kqswnal_unmap_tx (ktx); /* release temporary mappings */ - ktx->ktx_state = KTX_IDLE; - - spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - - list_del (&ktx->ktx_list); /* take off active list */ - - if (ktx->ktx_isnblk) { - /* reserved for non-blocking tx */ - list_add (&ktx->ktx_list, 
&kqswnal_data.kqn_nblk_idletxds); - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - return; - } - - list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); - - /* anything blocking for a tx descriptor? */ - if (!kqswnal_data.kqn_shuttingdown && - !list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ - { - CDEBUG(D_NET,"wakeup fwd\n"); - - fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, - kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - } - - wake_up (&kqswnal_data.kqn_idletxd_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - - if (fwd == NULL) - return; - - /* schedule packet for forwarding again */ - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); -} - -kqswnal_tx_t * -kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) -{ - unsigned long flags; - kqswnal_tx_t *ktx = NULL; - - for (;;) { - spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - - if (kqswnal_data.kqn_shuttingdown) - break; - - /* "normal" descriptor is free */ - if (!list_empty (&kqswnal_data.kqn_idletxds)) { - ktx = list_entry (kqswnal_data.kqn_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } - - if (fwd != NULL) /* forwarded packet? 
*/ - break; - - /* doing a local transmit */ - if (!may_block) { - if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { - CERROR ("intr tx desc pool exhausted\n"); - break; - } - - ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } - - /* block for idle tx */ - - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - - CDEBUG (D_NET, "blocking for tx desc\n"); - wait_event (kqswnal_data.kqn_idletxd_waitq, - !list_empty (&kqswnal_data.kqn_idletxds) || - kqswnal_data.kqn_shuttingdown); - } - - if (ktx != NULL) { - list_del (&ktx->ktx_list); - list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); - ktx->ktx_launcher = current->pid; - atomic_inc(&kqswnal_data.kqn_pending_txs); - } else if (fwd != NULL) { - /* queue forwarded packet until idle txd available */ - CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); - list_add_tail (&fwd->kprfd_list, - &kqswnal_data.kqn_idletxd_fwdq); - } - - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - - /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ - LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); - - return (ktx); -} - -void -kqswnal_tx_done (kqswnal_tx_t *ktx, int error) -{ - switch (ktx->ktx_state) { - case KTX_FORWARDING: /* router asked me to forward this packet */ - kpr_fwd_done (&kqswnal_data.kqn_router, - (kpr_fwd_desc_t *)ktx->ktx_args[0], error); - break; - - case KTX_RDMAING: /* optimized GET/PUT handled */ - case KTX_PUTTING: /* optimized PUT sent */ - case KTX_SENDING: /* normal send */ - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[1], - (error == 0) ? 
PTL_OK : PTL_FAIL); - break; - - case KTX_GETTING: /* optimized GET sent & REPLY received */ - /* Complete the GET with success since we can't avoid - * delivering a REPLY event; we committed to it when we - * launched the GET */ - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[1], PTL_OK); - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[2], - (error == 0) ? PTL_OK : PTL_FAIL); - break; - - default: - LASSERT (0); - } - - kqswnal_put_idle_tx (ktx); -} - -static void -kqswnal_txhandler(EP_TXD *txd, void *arg, int status) -{ - kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - - LASSERT (txd != NULL); - LASSERT (ktx != NULL); - - CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status); - - if (status != EP_SUCCESS) { - - CERROR ("Tx completion to "LPX64" failed: %d\n", - ktx->ktx_nid, status); - - kqswnal_notify_peer_down(ktx); - status = -EHOSTDOWN; - - } else switch (ktx->ktx_state) { - - case KTX_GETTING: - case KTX_PUTTING: - /* RPC completed OK; but what did our peer put in the status - * block? */ -#if MULTIRAIL_EKC - status = ep_txd_statusblk(txd)->Data[0]; -#else - status = ep_txd_statusblk(txd)->Status; -#endif - break; - - case KTX_FORWARDING: - case KTX_SENDING: - status = 0; - break; - - default: - LBUG(); - break; - } - - kqswnal_tx_done (ktx, status); -} - -int -kqswnal_launch (kqswnal_tx_t *ktx) -{ - /* Don't block for transmit descriptor if we're in interrupt context */ - int attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0; - int dest = kqswnal_nid2elanid (ktx->ktx_nid); - unsigned long flags; - int rc; - - ktx->ktx_launchtime = jiffies; - - if (kqswnal_data.kqn_shuttingdown) - return (-ESHUTDOWN); - - LASSERT (dest >= 0); /* must be a peer */ - -#if MULTIRAIL_EKC - if (ktx->ktx_nmappedpages != 0) - attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail); -#endif - - switch (ktx->ktx_state) { - case KTX_GETTING: - case KTX_PUTTING: - /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. 
- * The other frags are the payload, awaiting RDMA */ - rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - NULL, ktx->ktx_frags, 1); - break; - - case KTX_FORWARDING: - case KTX_SENDING: -#if MULTIRAIL_EKC - rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - NULL, ktx->ktx_frags, ktx->ktx_nfrag); -#else - rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - ktx->ktx_frags, ktx->ktx_nfrag); -#endif - break; - - default: - LBUG(); - rc = -EINVAL; /* no compiler warning please */ - break; - } - - switch (rc) { - case EP_SUCCESS: /* success */ - return (0); - - case EP_ENOMEM: /* can't allocate ep txd => queue for later */ - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); - return (0); - - default: /* fatal error */ - CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); - kqswnal_notify_peer_down(ktx); - return (-EHOSTUNREACH); - } -} - -#if 0 -static char * -hdr_type_string (ptl_hdr_t *hdr) -{ - switch (hdr->type) { - case PTL_MSG_ACK: - return ("ACK"); - case PTL_MSG_PUT: - return ("PUT"); - case PTL_MSG_GET: - return ("GET"); - case PTL_MSG_REPLY: - return ("REPLY"); - default: - return (""); - } -} - -static void -kqswnal_cerror_hdr(ptl_hdr_t * hdr) -{ - char *type_str = hdr_type_string (hdr); - - CERROR("P3 Header at %p of type %s length %d\n", hdr, type_str, - le32_to_cpu(hdr->payload_length)); - CERROR(" From nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->src_nid), - le32_to_cpu(hdr->src_pid)); - CERROR(" To nid/pid "LPU64"/%u\n", le64_to_cpu(hdr->dest_nid), - le32_to_cpu(hdr->dest_pid)); - - switch (le32_to_cpu(hdr->type)) { - case PTL_MSG_PUT: - CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - 
le32_to_cpu(hdr->msg.put.ptl_index), - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - le64_to_cpu(hdr->msg.put.match_bits)); - CERROR(" offset %d, hdr data "LPX64"\n", - le32_to_cpu(hdr->msg.put.offset), - hdr->msg.put.hdr_data); - break; - - case PTL_MSG_GET: - CERROR(" Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", - le32_to_cpu(hdr->msg.get.ptl_index), - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CERROR(" Length %d, src offset %d\n", - le32_to_cpu(hdr->msg.get.sink_length), - le32_to_cpu(hdr->msg.get.src_offset)); - break; - - case PTL_MSG_ACK: - CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - le32_to_cpu(hdr->msg.ack.mlength)); - break; - - case PTL_MSG_REPLY: - CERROR(" dst md "LPX64"."LPX64"\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - } - -} /* end of print_hdr() */ -#endif - -#if !MULTIRAIL_EKC -void -kqswnal_print_eiov (int how, char *str, int n, EP_IOVEC *iov) -{ - int i; - - CDEBUG (how, "%s: %d\n", str, n); - for (i = 0; i < n; i++) { - CDEBUG (how, " %08x for %d\n", iov[i].Base, iov[i].Len); - } -} - -int -kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, - int nsrc, EP_IOVEC *src, - int ndst, EP_IOVEC *dst) -{ - int count; - int nob; - - LASSERT (ndv > 0); - LASSERT (nsrc > 0); - LASSERT (ndst > 0); - - for (count = 0; count < ndv; count++, dv++) { - - if (nsrc == 0 || ndst == 0) { - if (nsrc != ndst) { - /* For now I'll barf on any left over entries */ - CERROR ("mismatched src and dst iovs\n"); - return (-EINVAL); - } - return (count); - } - - nob = (src->Len < dst->Len) ? 
src->Len : dst->Len; - dv->Len = nob; - dv->Source = src->Base; - dv->Dest = dst->Base; - - if (nob >= src->Len) { - src++; - nsrc--; - } else { - src->Len -= nob; - src->Base += nob; - } - - if (nob >= dst->Len) { - dst++; - ndst--; - } else { - src->Len -= nob; - src->Base += nob; - } - } - - CERROR ("DATAVEC too small\n"); - return (-E2BIG); -} -#else -int -kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, - int nrfrag, EP_NMD *rfrag) -{ - int i; - - if (nlfrag != nrfrag) { - CERROR("Can't cope with unequal # frags: %d local %d remote\n", - nlfrag, nrfrag); - return (-EINVAL); - } - - for (i = 0; i < nlfrag; i++) - if (lfrag[i].nmd_len != rfrag[i].nmd_len) { - CERROR("Can't cope with unequal frags %d(%d):" - " %d local %d remote\n", - i, nlfrag, lfrag[i].nmd_len, rfrag[i].nmd_len); - return (-EINVAL); - } - - return (0); -} -#endif - -kqswnal_remotemd_t * -kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid) -{ - char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); - ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - ptl_nid_t nid = kqswnal_rx_nid(krx); - - /* Note (1) lib_parse has already flipped hdr. - * (2) RDMA addresses are sent in native endian-ness. 
When - * EKC copes with different endian nodes, I'll fix this (and - * eat my hat :) */ - - LASSERT (krx->krx_nob >= sizeof(*hdr)); - - if (hdr->type != type) { - CERROR ("Unexpected optimized get/put type %d (%d expected)" - "from "LPX64"\n", hdr->type, type, nid); - return (NULL); - } - - if (hdr->src_nid != nid) { - CERROR ("Unexpected optimized get/put source NID " - LPX64" from "LPX64"\n", hdr->src_nid, nid); - return (NULL); - } - - LASSERT (nid == expected_nid); - - if (buffer + krx->krx_nob < (char *)(rmd + 1)) { - /* msg too small to discover rmd size */ - CERROR ("Incoming message [%d] too small for RMD (%d needed)\n", - krx->krx_nob, (int)(((char *)(rmd + 1)) - buffer)); - return (NULL); - } - - if (buffer + krx->krx_nob < (char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) { - /* rmd doesn't fit in the incoming message */ - CERROR ("Incoming message [%d] too small for RMD[%d] (%d needed)\n", - krx->krx_nob, rmd->kqrmd_nfrag, - (int)(((char *)&rmd->kqrmd_frag[rmd->kqrmd_nfrag]) - buffer)); - return (NULL); - } - - return (rmd); -} - -void -kqswnal_rdma_store_complete (EP_RXD *rxd) -{ - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - - CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - - LASSERT (ktx->ktx_state == KTX_RDMAING); - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); - - krx->krx_rpc_reply_needed = 0; - kqswnal_rx_decref (krx); - - /* free ktx & finalize() its lib_msg_t */ - kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); -} - -void -kqswnal_rdma_fetch_complete (EP_RXD *rxd) -{ - /* Completed fetching the PUT data */ - int status = ep_rxd_status(rxd); - kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - unsigned long flags; - - CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, - "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - - LASSERT (ktx->ktx_state == KTX_RDMAING); - LASSERT (krx->krx_rxd == rxd); - /* RPC completes with failure by default */ - LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply_status != 0); - - if (status == EP_SUCCESS) { - status = krx->krx_rpc_reply_status = 0; - } else { - /* Abandon RPC since get failed */ - krx->krx_rpc_reply_needed = 0; - status = -ECONNABORTED; - } - - /* free ktx & finalize() its lib_msg_t */ - kqswnal_tx_done(ktx, status); - - if (!in_interrupt()) { - /* OK to complete the RPC now (iff I had the last ref) */ - kqswnal_rx_decref (krx); - return; - } - - LASSERT (krx->krx_state == KRX_PARSE); - krx->krx_state = KRX_COMPLETING; - - /* Complete the RPC in thread context */ - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); -} - -int -kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, - int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t len) -{ - kqswnal_remotemd_t *rmd; - kqswnal_tx_t *ktx; - int eprc; - int rc; -#if !MULTIRAIL_EKC - EP_DATAVEC datav[EP_MAXFRAG]; - int ndatav; -#endif - - LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT); - /* Not both mapped and paged payload */ - LASSERT (iov == NULL || kiov == NULL); - /* RPC completes with failure by default */ - LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply_status != 0); - - rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid); - if (rmd == NULL) - return (-EPROTO); - - if (len == 0) { - /* data got truncated to nothing. 
*/ - lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK); - /* Let kqswnal_rx_done() complete the RPC with success */ - krx->krx_rpc_reply_status = 0; - return (0); - } - - /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not - actually sending a portals message with it */ - ktx = kqswnal_get_idle_tx(NULL, 0); - if (ktx == NULL) { - CERROR ("Can't get txd for RDMA with "LPX64"\n", - libmsg->ev.initiator.nid); - return (-ENOMEM); - } - - ktx->ktx_state = KTX_RDMAING; - ktx->ktx_nid = libmsg->ev.initiator.nid; - ktx->ktx_args[0] = krx; - ktx->ktx_args[1] = libmsg; - -#if MULTIRAIL_EKC - /* Map on the rail the RPC prefers */ - ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx, - ep_rxd_railmask(krx->krx_rxd)); -#endif - - /* Start mapping at offset 0 (we're not mapping any headers) */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - - if (kiov != NULL) - rc = kqswnal_map_tx_kiov(ktx, offset, len, niov, kiov); - else - rc = kqswnal_map_tx_iov(ktx, offset, len, niov, iov); - - if (rc != 0) { - CERROR ("Can't map local RDMA data: %d\n", rc); - goto out; - } - -#if MULTIRAIL_EKC - rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); - if (rc != 0) { - CERROR ("Incompatible RDMA descriptors\n"); - goto out; - } -#else - switch (type) { - default: - LBUG(); - - case PTL_MSG_GET: - ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, - ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); - break; - - case PTL_MSG_PUT: - ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, - rmd->kqrmd_nfrag, rmd->kqrmd_frag, - ktx->ktx_nfrag, ktx->ktx_frags); - break; - } - - if (ndatav < 0) { - CERROR ("Can't create datavec: %d\n", ndatav); - rc = ndatav; - goto out; - } -#endif - - LASSERT (atomic_read(&krx->krx_refcount) > 0); - /* Take an extra ref for the completion callback */ - atomic_inc(&krx->krx_refcount); - - switch (type) { - default: - LBUG(); - - case PTL_MSG_GET: -#if MULTIRAIL_EKC - eprc = ep_complete_rpc(krx->krx_rxd, - 
kqswnal_rdma_store_complete, ktx, - &kqswnal_data.kqn_rpc_success, - ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); -#else - eprc = ep_complete_rpc (krx->krx_rxd, - kqswnal_rdma_store_complete, ktx, - &kqswnal_data.kqn_rpc_success, - datav, ndatav); - if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; -#endif - if (eprc != EP_SUCCESS) { - CERROR("can't complete RPC: %d\n", eprc); - /* don't re-attempt RPC completion */ - krx->krx_rpc_reply_needed = 0; - rc = -ECONNABORTED; - } - break; - - case PTL_MSG_PUT: -#if MULTIRAIL_EKC - eprc = ep_rpc_get (krx->krx_rxd, - kqswnal_rdma_fetch_complete, ktx, - rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); -#else - eprc = ep_rpc_get (krx->krx_rxd, - kqswnal_rdma_fetch_complete, ktx, - datav, ndatav); -#endif - if (eprc != EP_SUCCESS) { - CERROR("ep_rpc_get failed: %d\n", eprc); - /* Don't attempt RPC completion: - * EKC nuked it when the get failed */ - krx->krx_rpc_reply_needed = 0; - rc = -ECONNABORTED; - } - break; - } - - out: - if (rc != 0) { - kqswnal_rx_decref(krx); /* drop callback's ref */ - kqswnal_put_idle_tx (ktx); - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); - return (rc); -} - -static ptl_err_t -kqswnal_sendmsg (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - kqswnal_tx_t *ktx; - int rc; - ptl_nid_t targetnid; -#if KQSW_CHECKSUM - int i; - kqsw_csum_t csum; - int sumoff; - int sumnob; -#endif - /* NB 1. hdr is in network byte order */ - /* 2. 
'private' depends on the message type */ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 - " pid %u\n", payload_nob, payload_niov, nid, pid); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* It must be OK to kmap() if required */ - LASSERT (payload_kiov == NULL || !in_interrupt ()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - if (payload_nob > KQSW_MAXPAYLOAD) { - CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", - payload_nob, KQSW_MAXPAYLOAD); - return (PTL_FAIL); - } - - if (type == PTL_MSG_REPLY && /* can I look in 'private' */ - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ - /* Must be a REPLY for an optimized GET */ - rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET, - payload_niov, payload_iov, payload_kiov, - payload_offset, payload_nob); - return ((rc == 0) ? PTL_OK : PTL_FAIL); - } - - targetnid = nid; - if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ - rc = kpr_lookup (&kqswnal_data.kqn_router, nid, - sizeof (ptl_hdr_t) + payload_nob, &targetnid); - if (rc != 0) { - CERROR("Can't route to "LPX64": router error %d\n", - nid, rc); - return (PTL_FAIL); - } - if (kqswnal_nid2elanid (targetnid) < 0) { - CERROR("Bad gateway "LPX64" for "LPX64"\n", - targetnid, nid); - return (PTL_FAIL); - } - } - - /* I may not block for a transmit descriptor if I might block the - * receiver, or an interrupt handler. 
*/ - ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); - if (ktx == NULL) { - CERROR ("Can't get txd for msg type %d for "LPX64"\n", - type, libmsg->ev.initiator.nid); - return (PTL_NO_SPACE); - } - - ktx->ktx_state = KTX_SENDING; - ktx->ktx_nid = targetnid; - ktx->ktx_args[0] = private; - ktx->ktx_args[1] = libmsg; - ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ - - memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ - -#if KQSW_CHECKSUM - csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); - memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); - for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) { - LASSERT(i < niov); - if (payload_kiov != NULL) { - ptl_kiov_t *kiov = &payload_kiov[i]; - - if (sumoff >= kiov->kiov_len) { - sumoff -= kiov->kiov_len; - } else { - char *addr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset + sumoff; - int fragnob = kiov->kiov_len - sumoff; - - csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); - sumnob -= fragnob; - sumoff = 0; - kunmap(kiov->kiov_page); - } - } else { - struct iovec *iov = &payload_iov[i]; - - if (sumoff > iov->iov_len) { - sumoff -= iov->iov_len; - } else { - char *addr = iov->iov_base + sumoff; - int fragnob = iov->iov_len - sumoff; - - csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); - sumnob -= fragnob; - sumoff = 0; - } - } - } - memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); -#endif - - /* The first frag will be the pre-mapped buffer for (at least) the - * portals header. */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - - if (nid == targetnid && /* not forwarding */ - ((type == PTL_MSG_GET && /* optimize GET? */ - kqswnal_tunables.kqn_optimized_gets != 0 && - le32_to_cpu(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) || - (type == PTL_MSG_PUT && /* optimize PUT? 
*/ - kqswnal_tunables.kqn_optimized_puts != 0 && - payload_nob >= kqswnal_tunables.kqn_optimized_puts))) { - lib_md_t *md = libmsg->md; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - - /* Optimised path: I send over the Elan vaddrs of the local - * buffers, and my peer DMAs directly to/from them. - * - * First I set up ktx as if it was going to send this - * payload, (it needs to map it anyway). This fills - * ktx_frags[1] and onward with the network addresses - * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my - * message. */ - - ktx->ktx_state = (type == PTL_MSG_PUT) ? KTX_PUTTING : KTX_GETTING; - - if ((libmsg->md->options & PTL_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, 0, md->length, - md->md_niov, md->md_iov.kiov); - else - rc = kqswnal_map_tx_iov (ktx, 0, md->length, - md->md_niov, md->md_iov.iov); - if (rc != 0) - goto out; - - rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1; - - payload_nob = offsetof(kqswnal_remotemd_t, - kqrmd_frag[rmd->kqrmd_nfrag]); - LASSERT (KQSW_HDR_SIZE + payload_nob <= KQSW_TX_BUFFER_SIZE); - -#if MULTIRAIL_EKC - memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], - rmd->kqrmd_nfrag * sizeof(EP_NMD)); - - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); -#else - memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], - rmd->kqrmd_nfrag * sizeof(EP_IOVEC)); - - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; -#endif - if (type == PTL_MSG_GET) { - /* Allocate reply message now while I'm in thread context */ - ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib, - nid, libmsg); - if (ktx->ktx_args[2] == NULL) - goto out; - - /* NB finalizing the REPLY message is my - * responsibility now, whatever happens. 
*/ - } - - } else if (payload_nob <= KQSW_TX_MAXCONTIG) { - - /* small message: single frag copied into the pre-mapped buffer */ - -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; -#endif - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_iov, - payload_offset, payload_nob); - } - } else { - - /* large message: multiple frags: first is hdr in pre-mapped buffer */ - -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; -#endif - if (payload_kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, - payload_niov, payload_kiov); - else - rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob, - payload_niov, payload_iov); - if (rc != 0) - goto out; - } - - ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? - EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; - - rc = kqswnal_launch (ktx); - - out: - CDEBUG(rc == 0 ? D_NET : D_ERROR, - "%s "LPSZ" bytes to "LPX64" via "LPX64": rc %d\n", - rc == 0 ? "Sent" : "Failed to send", - payload_nob, nid, targetnid, rc); - - if (rc != 0) { - if (ktx->ktx_state == KTX_GETTING && - ktx->ktx_args[2] != NULL) { - /* We committed to reply, but there was a problem - * launching the GET. We can't avoid delivering a - * REPLY event since we committed above, so we - * pretend the GET succeeded but the REPLY - * failed. 
*/ - rc = 0; - lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK); - lib_finalize (&kqswnal_lib, private, - (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL); - } - - kqswnal_put_idle_tx (ktx); - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); - return (rc == 0 ? PTL_OK : PTL_FAIL); -} - -static ptl_err_t -kqswnal_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_offset, - size_t payload_nob) -{ - return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_nob)); -} - -static ptl_err_t -kqswnal_send_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_nob)); -} - -void -kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - int rc; - kqswnal_tx_t *ktx; - ptl_kiov_t *kiov = fwd->kprfd_kiov; - int niov = fwd->kprfd_niov; - int nob = fwd->kprfd_nob; - ptl_nid_t nid = fwd->kprfd_gateway_nid; - -#if KQSW_CHECKSUM - CERROR ("checksums for forwarded packets not implemented\n"); - LBUG (); -#endif - /* The router wants this NAL to forward a packet */ - CDEBUG (D_NET, "forwarding [%p] to "LPX64", payload: %d frags %d bytes\n", - fwd, nid, niov, nob); - - ktx = kqswnal_get_idle_tx (fwd, 0); - if (ktx == NULL) /* can't get txd right now */ - return; /* fwd will be scheduled when tx desc freed */ - - if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */ - nid = fwd->kprfd_target_nid; /* target is final dest */ - - if (kqswnal_nid2elanid (nid) < 0) { - CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); - rc = -EHOSTUNREACH; - goto out; - 
} - - /* copy hdr into pre-mapped buffer */ - memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t)); - - ktx->ktx_port = (nob <= KQSW_SMALLPAYLOAD) ? - EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; - ktx->ktx_nid = nid; - ktx->ktx_state = KTX_FORWARDING; - ktx->ktx_args[0] = fwd; - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - - if (nob <= KQSW_TX_MAXCONTIG) - { - /* send payload from ktx's pre-mapped contiguous buffer */ -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + nob); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + nob; -#endif - if (nob > 0) - lib_copy_kiov2buf(ktx->ktx_buffer + KQSW_HDR_SIZE, - niov, kiov, 0, nob); - } - else - { - /* zero copy payload */ -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; -#endif - rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov); - if (rc != 0) - goto out; - } - - rc = kqswnal_launch (ktx); - out: - if (rc != 0) { - CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); - - /* complete now (with failure) */ - kqswnal_tx_done (ktx, rc); - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); -} - -void -kqswnal_fwd_callback (void *arg, int error) -{ - kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; - - /* The router has finished forwarding this packet */ - - if (error != 0) - { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); - - CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", - le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid),error); - } - - LASSERT (atomic_read(&krx->krx_refcount) == 1); - kqswnal_rx_decref (krx); -} - -void -kqswnal_requeue_rx (kqswnal_rx_t *krx) -{ - LASSERT (atomic_read(&krx->krx_refcount) == 0); - LASSERT (!krx->krx_rpc_reply_needed); - - krx->krx_state = KRX_POSTED; - -#if MULTIRAIL_EKC - if 
(kqswnal_data.kqn_shuttingdown) { - /* free EKC rxd on shutdown */ - ep_complete_receive(krx->krx_rxd); - } else { - /* repost receive */ - ep_requeue_receive(krx->krx_rxd, - kqswnal_rxhandler, krx, - &krx->krx_elanbuffer, 0); - } -#else - if (kqswnal_data.kqn_shuttingdown) - return; - - if (krx->krx_rxd == NULL) { - /* We had a failed ep_complete_rpc() which nukes the - * descriptor in "old" EKC */ - int eprc = ep_queue_receive(krx->krx_eprx, - kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); - LASSERT (eprc == EP_SUCCESS); - /* We don't handle failure here; it's incredibly rare - * (never reported?) and only happens with "old" EKC */ - } else { - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE); - } -#endif -} - -void -kqswnal_rpc_complete (EP_RXD *rxd) -{ - int status = ep_rxd_status(rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg(rxd); - - CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, - "rxd %p, krx %p, status %d\n", rxd, krx, status); - - LASSERT (krx->krx_rxd == rxd); - LASSERT (krx->krx_rpc_reply_needed); - - krx->krx_rpc_reply_needed = 0; - kqswnal_requeue_rx (krx); -} - -void -kqswnal_rx_done (kqswnal_rx_t *krx) -{ - int rc; - EP_STATUSBLK *sblk; - - LASSERT (atomic_read(&krx->krx_refcount) == 0); - - if (krx->krx_rpc_reply_needed) { - /* We've not completed the peer's RPC yet... */ - sblk = (krx->krx_rpc_reply_status == 0) ? 
- &kqswnal_data.kqn_rpc_success : - &kqswnal_data.kqn_rpc_failed; - - LASSERT (!in_interrupt()); -#if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - sblk, NULL, NULL, 0); - if (rc == EP_SUCCESS) - return; -#else - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - sblk, NULL, 0); - if (rc == EP_SUCCESS) - return; - - /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; -#endif - CERROR("can't complete RPC: %d\n", rc); - krx->krx_rpc_reply_needed = 0; - } - - kqswnal_requeue_rx(krx); -} - -void -kqswnal_parse (kqswnal_rx_t *krx) -{ - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); - ptl_nid_t dest_nid = le64_to_cpu(hdr->dest_nid); - int payload_nob; - int nob; - int niov; - - LASSERT (atomic_read(&krx->krx_refcount) == 1); - - if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */ - /* I ignore parse errors since I'm not consuming a byte - * stream */ - (void)lib_parse (&kqswnal_lib, hdr, krx); - - /* Drop my ref; any RDMA activity takes an additional ref */ - kqswnal_rx_decref(krx); - return; - } - -#if KQSW_CHECKSUM - LASSERTF (0, "checksums for forwarded packets not implemented\n"); -#endif - - if (kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ - { - CERROR("dropping packet from "LPX64" for "LPX64 - ": target is peer\n", le64_to_cpu(hdr->src_nid), dest_nid); - - kqswnal_rx_decref (krx); - return; - } - - nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE; - niov = 0; - if (nob > 0) { - krx->krx_kiov[0].kiov_offset = KQSW_HDR_SIZE; - krx->krx_kiov[0].kiov_len = MIN(PAGE_SIZE - KQSW_HDR_SIZE, nob); - niov = 1; - nob -= PAGE_SIZE - KQSW_HDR_SIZE; - - while (nob > 0) { - LASSERT (niov < krx->krx_npages); - - krx->krx_kiov[niov].kiov_offset = 0; - krx->krx_kiov[niov].kiov_len = MIN(PAGE_SIZE, nob); - niov++; - nob -= PAGE_SIZE; - } - } - - kpr_fwd_init (&krx->krx_fwd, dest_nid, - hdr, payload_nob, niov, krx->krx_kiov, - 
kqswnal_fwd_callback, krx); - - kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); -} - -/* Receive Interrupt Handler: posts to schedulers */ -void -kqswnal_rxhandler(EP_RXD *rxd) -{ - unsigned long flags; - int nob = ep_rxd_len (rxd); - int status = ep_rxd_status (rxd); - kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); - - CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", - rxd, krx, nob, status); - - LASSERT (krx != NULL); - LASSERT (krx->krx_state = KRX_POSTED); - - krx->krx_state = KRX_PARSE; - krx->krx_rxd = rxd; - krx->krx_nob = nob; - - /* RPC reply iff rpc request received without error */ - krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) && - (status == EP_SUCCESS || - status == EP_MSG_TOO_BIG); - - /* Default to failure if an RPC reply is requested but not handled */ - krx->krx_rpc_reply_status = -EPROTO; - atomic_set (&krx->krx_refcount, 1); - - /* must receive a whole header to be able to parse */ - if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) - { - /* receives complete with failure when receiver is removed */ -#if MULTIRAIL_EKC - if (status == EP_SHUTDOWN) - LASSERT (kqswnal_data.kqn_shuttingdown); - else - CERROR("receive status failed with status %d nob %d\n", - ep_rxd_status(rxd), nob); -#else - if (!kqswnal_data.kqn_shuttingdown) - CERROR("receive status failed with status %d nob %d\n", - ep_rxd_status(rxd), nob); -#endif - kqswnal_rx_decref(krx); - return; - } - - if (!in_interrupt()) { - kqswnal_parse(krx); - return; - } - - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); -} - -#if KQSW_CHECKSUM -void -kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) -{ - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); - - CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64 - ", dpid %d, spid %d, type %d\n", - 
ishdr ? "Header" : "Payload", krx, - le64_to_cpu(hdr->dest_nid), le64_to_cpu(hdr->src_nid) - le32_to_cpu(hdr->dest_pid), le32_to_cpu(hdr->src_pid), - le32_to_cpu(hdr->type)); - - switch (le32_to_cpu(hdr->type)) - { - case PTL_MSG_ACK: - CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 - " len %u\n", - le32_to_cpu(hdr->msg.ack.mlength), - hdr->msg.ack.dst_wmd.handle_cookie, - hdr->msg.ack.dst_wmd.handle_idx, - le64_to_cpu(hdr->msg.ack.match_bits), - le32_to_cpu(hdr->msg.ack.length)); - break; - case PTL_MSG_PUT: - CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 - " len %u off %u data "LPX64"\n", - le32_to_cpu(hdr->msg.put.ptl_index), - hdr->msg.put.ack_wmd.handle_cookie, - hdr->msg.put.ack_wmd.handle_idx, - le64_to_cpu(hdr->msg.put.match_bits), - le32_to_cpu(hdr->msg.put.length), - le32_to_cpu(hdr->msg.put.offset), - hdr->msg.put.hdr_data); - break; - case PTL_MSG_GET: - CERROR ("GET: <>\n"); - break; - case PTL_MSG_REPLY: - CERROR ("REPLY: <>\n"); - break; - default: - CERROR ("TYPE?: <>\n"); - } -} -#endif - -static ptl_err_t -kqswnal_recvmsg (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) -{ - kqswnal_rx_t *krx = (kqswnal_rx_t *)private; - char *buffer = page_address(krx->krx_kiov[0].kiov_page); - ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; - int page; - char *page_ptr; - int page_nob; - char *iov_ptr; - int iov_nob; - int frag; - int rc; -#if KQSW_CHECKSUM - kqsw_csum_t senders_csum; - kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr)); - size_t csum_len = mlen; - int csum_frags = 0; - int csum_nob = 0; - static atomic_t csum_counter; - int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; - - atomic_inc (&csum_counter); - - memcpy (&senders_csum, buffer + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); - if (senders_csum != hdr_csum) - kqswnal_csum_error (krx, 1); -#endif - /* NB lib_parse() has already 
flipped *hdr */ - - CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); - - if (krx->krx_rpc_reply_needed && - hdr->type == PTL_MSG_PUT) { - /* This must be an optimized PUT */ - rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT, - niov, iov, kiov, offset, mlen); - return (rc == 0 ? PTL_OK : PTL_FAIL); - } - - /* What was actually received must be >= payload. */ - LASSERT (mlen <= rlen); - if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { - CERROR("Bad message size: have %d, need %d + %d\n", - krx->krx_nob, (int)KQSW_HDR_SIZE, (int)mlen); - return (PTL_FAIL); - } - - /* It must be OK to kmap() if required */ - LASSERT (kiov == NULL || !in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - if (mlen != 0) { - page = 0; - page_ptr = buffer + KQSW_HDR_SIZE; - page_nob = PAGE_SIZE - KQSW_HDR_SIZE; - - LASSERT (niov > 0); - - if (kiov != NULL) { - /* skip complete frags */ - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - iov_ptr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset + offset; - iov_nob = kiov->kiov_len - offset; - } else { - /* skip complete frags */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - iov_ptr = iov->iov_base + offset; - iov_nob = iov->iov_len - offset; - } - - for (;;) - { - frag = mlen; - if (frag > page_nob) - frag = page_nob; - if (frag > iov_nob) - frag = iov_nob; - - memcpy (iov_ptr, page_ptr, frag); -#if KQSW_CHECKSUM - payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); - csum_nob += frag; - csum_frags++; -#endif - mlen -= frag; - if (mlen == 0) - break; - - page_nob -= frag; - if (page_nob != 0) - page_ptr += frag; - else - { - page++; - LASSERT (page < krx->krx_npages); - page_ptr = page_address(krx->krx_kiov[page].kiov_page); - page_nob = PAGE_SIZE; - } - - iov_nob -= frag; - if (iov_nob != 0) - iov_ptr += frag; - else if (kiov != NULL) { - kunmap 
(kiov->kiov_page); - kiov++; - niov--; - LASSERT (niov > 0); - iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; - iov_nob = kiov->kiov_len; - } else { - iov++; - niov--; - LASSERT (niov > 0); - iov_ptr = iov->iov_base; - iov_nob = iov->iov_len; - } - } - - if (kiov != NULL) - kunmap (kiov->kiov_page); - } - -#if KQSW_CHECKSUM - memcpy (&senders_csum, buffer + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), - sizeof(kqsw_csum_t)); - - if (csum_len != rlen) - CERROR("Unable to checksum data in user's buffer\n"); - else if (senders_csum != payload_csum) - kqswnal_csum_error (krx, 0); - - if (csum_verbose) - CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " - "csum_nob %d\n", - hdr_csum, payload_csum, csum_frags, csum_nob); -#endif - lib_finalize(nal, private, libmsg, PTL_OK); - - return (PTL_OK); -} - -static ptl_err_t -kqswnal_recv(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) -{ - return (kqswnal_recvmsg(nal, private, libmsg, - niov, iov, NULL, - offset, mlen, rlen)); -} - -static ptl_err_t -kqswnal_recv_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) -{ - return (kqswnal_recvmsg(nal, private, libmsg, - niov, NULL, kiov, - offset, mlen, rlen)); -} - -int -kqswnal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - - if (pid < 0) - return ((int)pid); - - atomic_inc (&kqswnal_data.kqn_nthreads); - return (0); -} - -void -kqswnal_thread_fini (void) -{ - atomic_dec (&kqswnal_data.kqn_nthreads); -} - -int -kqswnal_scheduler (void *arg) -{ - kqswnal_rx_t *krx; - kqswnal_tx_t *ktx; - kpr_fwd_desc_t *fwd; - unsigned long flags; - int rc; - int counter = 0; - int did_something; - - kportal_daemonize ("kqswnal_sched"); - kportal_blockallsigs (); - - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - for (;;) - { - 
did_something = 0; - - if (!list_empty (&kqswnal_data.kqn_readyrxds)) - { - krx = list_entry(kqswnal_data.kqn_readyrxds.next, - kqswnal_rx_t, krx_list); - list_del (&krx->krx_list); - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - switch (krx->krx_state) { - case KRX_PARSE: - kqswnal_parse (krx); - break; - case KRX_COMPLETING: - kqswnal_rx_decref (krx); - break; - default: - LBUG(); - } - - did_something = 1; - spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); - } - - if (!list_empty (&kqswnal_data.kqn_delayedtxds)) - { - ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, - kqswnal_tx_t, ktx_list); - list_del_init (&ktx->ktx_delayed_list); - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - rc = kqswnal_launch (ktx); - if (rc != 0) { - CERROR("Failed delayed transmit to "LPX64 - ": %d\n", ktx->ktx_nid, rc); - kqswnal_tx_done (ktx, rc); - } - atomic_dec (&kqswnal_data.kqn_pending_txs); - - did_something = 1; - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - } - - if (!list_empty (&kqswnal_data.kqn_delayedfwds)) - { - fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); - - /* If we're shutting down, this will just requeue fwd on kqn_idletxd_fwdq */ - kqswnal_fwd_packet (NULL, fwd); - - did_something = 1; - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - } - - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == KQSW_RESCHED) { - spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, - flags); - - counter = 0; - - if (!did_something) { - if (kqswnal_data.kqn_shuttingdown == 2) { - /* We only exit in stage 2 of shutdown when - * there's nothing left to do */ - break; - } - rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, - kqswnal_data.kqn_shuttingdown == 2 || - !list_empty(&kqswnal_data.kqn_readyrxds) || - !list_empty(&kqswnal_data.kqn_delayedtxds) || 
- !list_empty(&kqswnal_data.kqn_delayedfwds)); - LASSERT (rc == 0); - } else if (need_resched()) - schedule (); - - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - } - } - - kqswnal_thread_fini (); - return (0); -} - -lib_nal_t kqswnal_lib = -{ - libnal_data: &kqswnal_data, /* NAL private data */ - libnal_send: kqswnal_send, - libnal_send_pages: kqswnal_send_pages, - libnal_recv: kqswnal_recv, - libnal_recv_pages: kqswnal_recv_pages, - libnal_dist: kqswnal_dist -}; diff --git a/lustre/portals/knals/ranal/.cvsignore b/lustre/portals/knals/ranal/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/knals/ranal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/ranal/Makefile.in b/lustre/portals/knals/ranal/Makefile.in deleted file mode 100644 index 1772cc2..0000000 --- a/lustre/portals/knals/ranal/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := kranal -kranal-objs := ranal.o ranal_cb.o - -EXTRA_POST_CFLAGS := @RACPPFLAGS@ - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/ranal/autoMakefile.am b/lustre/portals/knals/ranal/autoMakefile.am deleted file mode 100644 index f136aa5..0000000 --- a/lustre/portals/knals/ranal/autoMakefile.am +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -if BUILD_RANAL -modulenet_DATA = kranal$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kranal-objs:%.o=%.c) ranal.h diff --git a/lustre/portals/knals/ranal/ranal.c b/lustre/portals/knals/ranal/ranal.c deleted file mode 100644 index c924827..0000000 --- a/lustre/portals/knals/ranal/ranal.c +++ /dev/null @@ -1,1983 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ -#include "ranal.h" - - -nal_t kranal_api; -ptl_handle_ni_t kranal_ni; -kra_data_t kranal_data; -kra_tunables_t kranal_tunables; - -#ifdef CONFIG_SYSCTL -#define RANAL_SYSCTL_TIMEOUT 1 -#define RANAL_SYSCTL_LISTENER_TIMEOUT 2 -#define RANAL_SYSCTL_BACKLOG 3 -#define RANAL_SYSCTL_PORT 4 -#define RANAL_SYSCTL_MAX_IMMEDIATE 5 - -#define RANAL_SYSCTL 202 - -static ctl_table kranal_ctl_table[] = { - {RANAL_SYSCTL_TIMEOUT, "timeout", - &kranal_tunables.kra_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", - &kranal_tunables.kra_listener_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_BACKLOG, "backlog", - &kranal_tunables.kra_backlog, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_PORT, "port", - &kranal_tunables.kra_port, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_MAX_IMMEDIATE, "max_immediate", - &kranal_tunables.kra_max_immediate, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table kranal_top_ctl_table[] = { - {RANAL_SYSCTL, "ranal", NULL, 0, 0555, kranal_ctl_table}, - { 0 } -}; -#endif - -int -kranal_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - - /* We've set up the socket's send buffer to be large enough for - * everything we send, so a single non-blocking send should - * complete without error. 
*/ - - set_fs(KERNEL_DS); - rc = sock_sendmsg(sock, &msg, iov.iov_len); - set_fs(oldmm); - - return rc; -} - -int -kranal_sock_read (struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - mm_segment_t oldmm = get_fs(); - long ticks = timeout * HZ; - unsigned long then; - struct timeval tv; - - LASSERT (nob > 0); - LASSERT (ticks > 0); - - for (;;) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = ticks / HZ, - .tv_usec = ((ticks % HZ) * 1000000) / HZ - }; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set socket recv timeout %d: %d\n", - timeout, rc); - return rc; - } - - set_fs(KERNEL_DS); - then = jiffies; - rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); - ticks -= jiffies - then; - set_fs(oldmm); - - if (rc < 0) - return rc; - - if (rc == 0) - return -ECONNABORTED; - - buffer = ((char *)buffer) + rc; - nob -= rc; - - if (nob == 0) - return 0; - - if (ticks <= 0) - return -ETIMEDOUT; - } -} - -int -kranal_create_sock(struct socket **sockp) -{ - struct socket *sock; - int rc; - struct timeval tv; - int option; - mm_segment_t oldmm = get_fs(); - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - /* Ensure sending connection info doesn't block */ - option = 2 * sizeof(kra_connreq_t); - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", option, rc); - goto failed; - } - - option = 1; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char 
*)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR: %d\n", rc); - goto failed; - } - - *sockp = sock; - return 0; - - failed: - sock_release(sock); - return rc; -} - -void -kranal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} - -void -kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn) -{ - RAP_RETURN rrc; - - memset(connreq, 0, sizeof(*connreq)); - - connreq->racr_magic = RANAL_MSG_MAGIC; - connreq->racr_version = RANAL_MSG_VERSION; - connreq->racr_devid = conn->rac_device->rad_id; - connreq->racr_nid = kranal_lib.libnal_ni.ni_pid.nid; - connreq->racr_timeout = conn->rac_timeout; - connreq->racr_incarnation = conn->rac_my_incarnation; - - rrc = RapkGetRiParams(conn->rac_rihandle, &connreq->racr_riparams); - LASSERT(rrc == RAP_SUCCESS); -} - -int -kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout) -{ - int i; - int rc; - - rc = kranal_sock_read(sock, connreq, sizeof(*connreq), timeout); - if (rc != 0) { - CERROR("Read failed: %d\n", rc); - return rc; - } - - if (connreq->racr_magic != RANAL_MSG_MAGIC) { - if (__swab32(connreq->racr_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x\n", connreq->racr_magic); - return -EPROTO; - } - - __swab32s(&connreq->racr_magic); - __swab16s(&connreq->racr_version); - __swab16s(&connreq->racr_devid); - __swab64s(&connreq->racr_nid); - __swab64s(&connreq->racr_incarnation); - __swab32s(&connreq->racr_timeout); - - __swab32s(&connreq->racr_riparams.FmaDomainHndl); - __swab32s(&connreq->racr_riparams.RcvCqHndl); - __swab32s(&connreq->racr_riparams.PTag); - __swab32s(&connreq->racr_riparams.CompletionCookie); - } - - if (connreq->racr_version != RANAL_MSG_VERSION) { - CERROR("Unexpected version %d\n", connreq->racr_version); - return -EPROTO; - } - - if (connreq->racr_nid == PTL_NID_ANY) { - CERROR("Received PTL_NID_ANY\n"); - return -EPROTO; - } - - if (connreq->racr_timeout < RANAL_MIN_TIMEOUT) { 
- CERROR("Received timeout %d < MIN %d\n", - connreq->racr_timeout, RANAL_MIN_TIMEOUT); - return -EPROTO; - } - - for (i = 0; i < kranal_data.kra_ndevs; i++) - if (connreq->racr_devid == - kranal_data.kra_devices[i].rad_id) - break; - - if (i == kranal_data.kra_ndevs) { - CERROR("Can't match device %d\n", connreq->racr_devid); - return -ENODEV; - } - - return 0; -} - -int -kranal_conn_isdup_locked(kra_peer_t *peer, __u64 incarnation) -{ - kra_conn_t *conn; - struct list_head *tmp; - int loopback = 0; - - list_for_each(tmp, &peer->rap_conns) { - conn = list_entry(tmp, kra_conn_t, rac_list); - - if (conn->rac_peer_incarnation < incarnation) { - /* Conns with an older incarnation get culled later */ - continue; - } - - if (!loopback && - conn->rac_peer_incarnation == incarnation && - peer->rap_nid == kranal_lib.libnal_ni.ni_pid.nid) { - /* loopback creates 2 conns */ - loopback = 1; - continue; - } - - return 1; - } - - return 0; -} - -void -kranal_set_conn_uniqueness (kra_conn_t *conn) -{ - unsigned long flags; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - conn->rac_my_incarnation = kranal_data.kra_next_incarnation++; - - do { /* allocate a unique cqid */ - conn->rac_cqid = kranal_data.kra_next_cqid++; - } while (kranal_cqid2conn_locked(conn->rac_cqid) != NULL); - - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); -} - -int -kranal_alloc_conn(kra_conn_t **connp, kra_device_t *dev) -{ - kra_conn_t *conn; - RAP_RETURN rrc; - - LASSERT (!in_interrupt()); - PORTAL_ALLOC(conn, sizeof(*conn)); - - if (conn == NULL) - return -ENOMEM; - - memset(conn, 0, sizeof(*conn)); - atomic_set(&conn->rac_refcount, 1); - INIT_LIST_HEAD(&conn->rac_list); - INIT_LIST_HEAD(&conn->rac_hashlist); - INIT_LIST_HEAD(&conn->rac_fmaq); - INIT_LIST_HEAD(&conn->rac_rdmaq); - INIT_LIST_HEAD(&conn->rac_replyq); - spin_lock_init(&conn->rac_lock); - - kranal_set_conn_uniqueness(conn); - - conn->rac_timeout = MAX(kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); - 
kranal_update_reaper_timeout(conn->rac_timeout); - - rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid, - dev->rad_ptag, - dev->rad_rdma_cq, dev->rad_fma_cq, - &conn->rac_rihandle); - if (rrc != RAP_SUCCESS) { - CERROR("RapkCreateRi failed: %d\n", rrc); - PORTAL_FREE(conn, sizeof(*conn)); - return -ENETDOWN; - } - - atomic_inc(&kranal_data.kra_nconns); - *connp = conn; - return 0; -} - -void -__kranal_conn_decref(kra_conn_t *conn) -{ - kra_tx_t *tx; - RAP_RETURN rrc; - - LASSERT (!in_interrupt()); - LASSERT (!conn->rac_scheduled); - LASSERT (list_empty(&conn->rac_list)); - LASSERT (list_empty(&conn->rac_hashlist)); - LASSERT (atomic_read(&conn->rac_refcount) == 0); - - while (!list_empty(&conn->rac_fmaq)) { - tx = list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list); - - list_del(&tx->tx_list); - kranal_tx_done(tx, -ECONNABORTED); - } - - /* We may not destroy this connection while it has RDMAs outstanding */ - LASSERT (list_empty(&conn->rac_rdmaq)); - - while (!list_empty(&conn->rac_replyq)) { - tx = list_entry(conn->rac_replyq.next, kra_tx_t, tx_list); - - list_del(&tx->tx_list); - kranal_tx_done(tx, -ECONNABORTED); - } - - rrc = RapkDestroyRi(conn->rac_device->rad_handle, - conn->rac_rihandle); - LASSERT (rrc == RAP_SUCCESS); - - if (conn->rac_peer != NULL) - kranal_peer_decref(conn->rac_peer); - - PORTAL_FREE(conn, sizeof(*conn)); - atomic_dec(&kranal_data.kra_nconns); -} - -void -kranal_terminate_conn_locked (kra_conn_t *conn) -{ - kra_peer_t *peer = conn->rac_peer; - - LASSERT (!in_interrupt()); - LASSERT (conn->rac_closing); - LASSERT (!list_empty(&conn->rac_hashlist)); - LASSERT (list_empty(&conn->rac_list)); - - /* Remove from conn hash table (no new callbacks) */ - list_del_init(&conn->rac_hashlist); - kranal_conn_decref(conn); - - /* Conn is now just waiting for remaining refs to go */ -} - -void -kranal_close_conn_locked (kra_conn_t *conn, int error) -{ - kra_peer_t *peer = conn->rac_peer; - - CDEBUG(error == 0 ? 
D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->rap_nid, error); - - LASSERT (!in_interrupt()); - LASSERT (!conn->rac_closing); - LASSERT (!list_empty(&conn->rac_hashlist)); - LASSERT (!list_empty(&conn->rac_list)); - - list_del_init(&conn->rac_list); - - if (list_empty(&peer->rap_conns) && - peer->rap_persistence == 0) { - /* Non-persistent peer with no more conns... */ - kranal_unlink_peer_locked(peer); - } - - conn->rac_closing = 1; - kranal_schedule_conn(conn); - - kranal_conn_decref(conn); /* lose peer's ref */ -} - -void -kranal_close_conn (kra_conn_t *conn, int error) -{ - unsigned long flags; - - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (!conn->rac_closing) - kranal_close_conn_locked(conn, error); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); -} - -int -kranal_passive_conn_handshake (struct socket *sock, - ptl_nid_t *peer_nidp, kra_conn_t **connp) -{ - struct sockaddr_in addr; - __u32 peer_ip; - unsigned int peer_port; - kra_connreq_t connreq; - ptl_nid_t peer_nid; - kra_conn_t *conn; - kra_device_t *dev; - RAP_RETURN rrc; - int rc; - int len; - int i; - - len = sizeof(addr); - rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2); - if (rc != 0) { - CERROR("Can't get peer's IP: %d\n", rc); - return rc; - } - - peer_ip = ntohl(addr.sin_addr.s_addr); - peer_port = ntohs(addr.sin_port); - - if (peer_port >= 1024) { - CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n", - HIPQUAD(peer_ip), peer_port); - return -ECONNREFUSED; - } - - rc = kranal_recv_connreq(sock, &connreq, - kranal_tunables.kra_listener_timeout); - if (rc != 0) { - CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rc); - return rc; - } - - peer_nid = connreq.racr_nid; - LASSERT (peer_nid != PTL_NID_ANY); - - for (i = 0;;i++) { - LASSERT(i < kranal_data.kra_ndevs); - dev = &kranal_data.kra_devices[i]; - if (dev->rad_id == connreq.racr_devid) - break; - } - - rc = 
kranal_alloc_conn(&conn, dev); - if (rc != 0) - return rc; - - conn->rac_peer_incarnation = connreq.racr_incarnation; - conn->rac_keepalive = RANAL_TIMEOUT2KEEPALIVE(connreq.racr_timeout); - kranal_update_reaper_timeout(conn->rac_keepalive); - - rrc = RapkSetRiParams(conn->rac_rihandle, &connreq.racr_riparams); - if (rrc != RAP_SUCCESS) { - CERROR("Can't set riparams for "LPX64": %d\n", peer_nid, rrc); - kranal_conn_decref(conn); - return -EPROTO; - } - - kranal_pack_connreq(&connreq, conn); - - rc = kranal_sock_write(sock, &connreq, sizeof(connreq)); - if (rc != 0) { - CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer_ip), peer_port, rc); - kranal_conn_decref(conn); - return rc; - } - - *connp = conn; - *peer_nidp = peer_nid; - return 0; -} - -int -ranal_connect_sock(kra_peer_t *peer, struct socket **sockp) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - unsigned int port; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - - for (port = 1023; port >= 512; port--) { - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(port); - locaddr.sin_addr.s_addr = htonl(INADDR_ANY); - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (peer->rap_port); - srvaddr.sin_addr.s_addr = htonl (peer->rap_ip); - - rc = kranal_create_sock(&sock); - if (rc != 0) - return rc; - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc != 0) { - sock_release(sock); - - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", port); - continue; - } - - CERROR("Can't bind to reserved port %d: %d\n", port, rc); - return rc; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - 0); - if (rc == 0) { - *sockp = sock; - return 0; - } - - sock_release(sock); - - if (rc != -EADDRNOTAVAIL) { - CERROR("Can't connect port %d to %u.%u.%u.%u/%d: 
%d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port, rc); - return rc; - } - - CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port); - } - - /* all ports busy */ - return -EHOSTUNREACH; -} - - -int -kranal_active_conn_handshake(kra_peer_t *peer, kra_conn_t **connp) -{ - struct sockaddr_in dstaddr; - kra_connreq_t connreq; - kra_conn_t *conn; - kra_device_t *dev; - struct socket *sock; - RAP_RETURN rrc; - int rc; - int idx; - - idx = peer->rap_nid & 0x7fffffff; - dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs]; - - rc = kranal_alloc_conn(&conn, dev); - if (rc != 0) - return rc; - - kranal_pack_connreq(&connreq, conn); - - rc = ranal_connect_sock(peer, &sock); - if (rc != 0) - goto failed_0; - - /* CAVEAT EMPTOR: the passive side receives with a SHORT rx timeout - * immediately after accepting a connection, so we connect and then - * send immediately. */ - - rc = kranal_sock_write(sock, &connreq, sizeof(connreq)); - if (rc != 0) { - CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_1; - } - - rc = kranal_recv_connreq(sock, &connreq, kranal_tunables.kra_timeout); - if (rc != 0) { - CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_1; - } - - sock_release(sock); - rc = -EPROTO; - - if (connreq.racr_nid != peer->rap_nid) { - CERROR("Unexpected nid from %u.%u.%u.%u/%d: " - "received "LPX64" expected "LPX64"\n", - HIPQUAD(peer->rap_ip), peer->rap_port, - connreq.racr_nid, peer->rap_nid); - goto failed_0; - } - - if (connreq.racr_devid != dev->rad_id) { - CERROR("Unexpected device id from %u.%u.%u.%u/%d: " - "received %d expected %d\n", - HIPQUAD(peer->rap_ip), peer->rap_port, - connreq.racr_devid, dev->rad_id); - goto failed_0; - } - - conn->rac_peer_incarnation = connreq.racr_incarnation; - conn->rac_keepalive = RANAL_TIMEOUT2KEEPALIVE(connreq.racr_timeout); - 
kranal_update_reaper_timeout(conn->rac_keepalive); - - rc = -ENETDOWN; - rrc = RapkSetRiParams(conn->rac_rihandle, &connreq.racr_riparams); - if (rrc != RAP_SUCCESS) { - CERROR("Can't set riparams for "LPX64": %d\n", - peer->rap_nid, rrc); - goto failed_0; - } - - *connp = conn; - return 0; - - failed_1: - sock_release(sock); - failed_0: - kranal_conn_decref(conn); - return rc; -} - -int -kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) -{ - kra_peer_t *peer2; - kra_tx_t *tx; - ptl_nid_t peer_nid; - unsigned long flags; - unsigned long timeout; - kra_conn_t *conn; - int rc; - int nstale; - - if (sock != NULL) { - /* passive: listener accepted sock */ - LASSERT (peer == NULL); - - rc = kranal_passive_conn_handshake(sock, &peer_nid, &conn); - if (rc != 0) - return rc; - - /* assume this is a new peer */ - peer = kranal_create_peer(peer_nid); - if (peer == NULL) { - CERROR("Can't allocate peer for "LPX64"\n", peer_nid); - kranal_conn_decref(conn); - return -ENOMEM; - } - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - peer2 = kranal_find_peer_locked(peer_nid); - if (peer2 == NULL) { - /* peer table takes my initial ref on peer */ - list_add_tail(&peer->rap_list, - kranal_nid2peerlist(peer_nid)); - } else { - /* peer_nid already in the peer table */ - kranal_peer_decref(peer); - peer = peer2; - } - /* NB I may now have a non-persistent peer in the peer - * table with no connections: I can't drop the global lock - * until I've given it a connection or removed it, and when - * I do 'peer' can disappear under me. 
*/ - } else { - /* active: connd wants to connect to peer */ - LASSERT (peer != NULL); - LASSERT (peer->rap_connecting); - - rc = kranal_active_conn_handshake(peer, &conn); - if (rc != 0) - return rc; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (!kranal_peer_active(peer)) { - /* raced with peer getting unlinked */ - write_unlock_irqrestore(&kranal_data.kra_global_lock, - flags); - kranal_conn_decref(conn); - return ESTALE; - } - } - - LASSERT (kranal_peer_active(peer)); /* peer is in the peer table */ - peer_nid = peer->rap_nid; - - /* Refuse to duplicate an existing connection (both sides might try - * to connect at once). NB we return success! We _do_ have a - * connection (so we don't need to remove the peer from the peer - * table) and we _don't_ have any blocked txs to complete */ - if (kranal_conn_isdup_locked(peer, conn->rac_peer_incarnation)) { - LASSERT (!list_empty(&peer->rap_conns)); - LASSERT (list_empty(&peer->rap_tx_queue)); - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - CWARN("Not creating duplicate connection to "LPX64"\n", - peer_nid); - kranal_conn_decref(conn); - return 0; - } - - kranal_peer_addref(peer); /* +1 ref for conn */ - conn->rac_peer = peer; - list_add_tail(&conn->rac_list, &peer->rap_conns); - - kranal_conn_addref(conn); /* +1 ref for conn table */ - list_add_tail(&conn->rac_hashlist, - kranal_cqid2connlist(conn->rac_cqid)); - - /* Schedule all packets blocking for a connection */ - while (!list_empty(&peer->rap_tx_queue)) { - tx = list_entry(&peer->rap_tx_queue.next, - kra_tx_t, tx_list); - - list_del(&tx->tx_list); - kranal_post_fma(conn, tx); - } - - nstale = kranal_close_stale_conns_locked(peer, conn->rac_peer_incarnation); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* CAVEAT EMPTOR: passive peer can disappear NOW */ - - if (nstale != 0) - CWARN("Closed %d stale conns to "LPX64"\n", nstale, peer_nid); - - /* Ensure conn gets checked. 
Transmits may have been queued and an - * FMA event may have happened before it got in the cq hash table */ - kranal_schedule_conn(conn); - return 0; -} - -void -kranal_connect (kra_peer_t *peer) -{ - kra_tx_t *tx; - unsigned long flags; - struct list_head zombies; - int rc; - - LASSERT (peer->rap_connecting); - - rc = kranal_conn_handshake(NULL, peer); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - LASSERT (peer->rap_connecting); - peer->rap_connecting = 0; - - if (rc == 0) { - /* kranal_conn_handshake() queues blocked txs immediately on - * success to avoid messages jumping the queue */ - LASSERT (list_empty(&peer->rap_tx_queue)); - - /* reset reconnection timeouts */ - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; - peer->rap_reconnect_time = CURRENT_TIME; - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - return; - } - - LASSERT (peer->rap_reconnect_interval != 0); - peer->rap_reconnect_time = CURRENT_TIME + peer->rap_reconnect_interval; - peer->rap_reconnect_interval = MAX(RANAL_MAX_RECONNECT_INTERVAL, - 1 * peer->rap_reconnect_interval); - - /* Grab all blocked packets while we have the global lock */ - list_add(&zombies, &peer->rap_tx_queue); - list_del_init(&peer->rap_tx_queue); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - if (list_empty(&zombies)) - return; - - CWARN("Dropping packets for "LPX64": connection failed\n", - peer->rap_nid); - - do { - tx = list_entry(zombies.next, kra_tx_t, tx_list); - - list_del(&tx->tx_list); - kranal_tx_done(tx, -EHOSTUNREACH); - - } while (!list_empty(&zombies)); -} - -int -kranal_listener(void *arg) -{ - struct sockaddr_in addr; - wait_queue_t wait; - struct socket *sock; - struct socket *newsock; - int port; - kra_connreq_t *connreqs; - char name[16]; - int rc; - - /* Parent thread holds kra_nid_mutex, and is, or is about to - * block on kra_listener_signal */ - - port = kranal_tunables.kra_port; - snprintf(name, sizeof(name), "kranal_lstn%03d", 
port); - kportal_daemonize(name); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - rc = -ENOMEM; - PORTAL_ALLOC(connreqs, 2 * sizeof(*connreqs)); - if (connreqs == NULL) - goto out_0; - - rc = kranal_create_sock(&sock); - if (rc != 0) - goto out_1; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(port); - addr.sin_addr.s_addr = INADDR_ANY; - - rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr)); - if (rc != 0) { - CERROR("Can't bind to port %d\n", port); - goto out_2; - } - - rc = sock->ops->listen(sock, kranal_tunables.kra_backlog); - if (rc != 0) { - CERROR("Can't set listen backlog %d: %d\n", - kranal_tunables.kra_backlog, rc); - goto out_2; - } - - LASSERT (kranal_data.kra_listener_sock == NULL); - kranal_data.kra_listener_sock = sock; - - /* unblock waiting parent */ - LASSERT (kranal_data.kra_listener_shutdown == 0); - up(&kranal_data.kra_listener_signal); - - /* Wake me any time something happens on my socket */ - add_wait_queue(sock->sk->sk_sleep, &wait); - - while (kranal_data.kra_listener_shutdown == 0) { - - newsock = sock_alloc(); - if (newsock == NULL) { - CERROR("Can't allocate new socket for accept\n"); - kranal_pause(HZ); - continue; - } - - set_current_state(TASK_INTERRUPTIBLE); - - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); - - if (rc == -EAGAIN && - kranal_data.kra_listener_shutdown == 0) - schedule(); - - set_current_state(TASK_RUNNING); - - if (rc != 0) { - sock_release(newsock); - if (rc != -EAGAIN) { - CERROR("Accept failed: %d\n", rc); - kranal_pause(HZ); - } - continue; - } - - kranal_conn_handshake(newsock, NULL); - sock_release(newsock); - } - - rc = 0; - remove_wait_queue(sock->sk->sk_sleep, &wait); - out_2: - sock_release(sock); - kranal_data.kra_listener_sock = NULL; - out_1: - PORTAL_FREE(connreqs, 2 * sizeof(*connreqs)); - out_0: - /* set completion status and unblock thread waiting for me - * (parent on startup failure, executioner on normal 
shutdown) */ - kranal_data.kra_listener_shutdown = rc; - up(&kranal_data.kra_listener_signal); - - return 0; -} - -int -kranal_start_listener (void) -{ - long pid; - int rc; - - CDEBUG(D_WARNING, "Starting listener\n"); - - /* Called holding kra_nid_mutex: listener stopped */ - LASSERT (kranal_data.kra_listener_sock == NULL); - - kranal_data.kra_listener_shutdown == 0; - pid = kernel_thread(kranal_listener, NULL, 0); - if (pid < 0) { - CERROR("Can't spawn listener: %ld\n", pid); - return (int)pid; - } - - /* Block until listener has started up. */ - down(&kranal_data.kra_listener_signal); - - rc = kranal_data.kra_listener_shutdown; - LASSERT ((rc != 0) == (kranal_data.kra_listener_sock == NULL)); - - CDEBUG(D_WARNING, "Listener %ld started OK\n", pid); - return rc; -} - -void -kranal_stop_listener(void) -{ - CDEBUG(D_WARNING, "Stopping listener\n"); - - /* Called holding kra_nid_mutex: listener running */ - LASSERT (kranal_data.kra_listener_sock != NULL); - - kranal_data.kra_listener_shutdown = 1; - wake_up_all(kranal_data.kra_listener_sock->sk->sk_sleep); - - /* Block until listener has torn down. 
*/ - down(&kranal_data.kra_listener_signal); - - LASSERT (kranal_data.kra_listener_sock == NULL); - CDEBUG(D_WARNING, "Listener stopped\n"); -} - -int -kranal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp) -{ - int *tunable = (int *)table->data; - int old_val; - int rc; - - down(&kranal_data.kra_nid_mutex); - - LASSERT (tunable == &kranal_tunables.kra_port || - tunable == &kranal_tunables.kra_backlog); - old_val = *tunable; - - rc = proc_dointvec(table, write, filp, buffer, lenp); - - if (write && - (*tunable != old_val || - kranal_data.kra_listener_sock == NULL)) { - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(); - - rc = kranal_start_listener(); - - if (rc != 0) { - *tunable = old_val; - kranal_start_listener(); - } - } - - up(&kranal_data.kra_nid_mutex); - return rc; -} - -int -kranal_set_mynid(ptl_nid_t nid) -{ - lib_ni_t *ni = &kranal_lib.libnal_ni; - int rc = 0; - - CDEBUG(D_NET, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - down(&kranal_data.kra_nid_mutex); - - if (nid == ni->ni_pid.nid) { - /* no change of NID */ - up(&kranal_data.kra_nid_mutex); - return 0; - } - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(); - - ni->ni_pid.nid = nid; - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. 
*/ - kranal_del_peer(PTL_NID_ANY, 0); - - if (nid != PTL_NID_ANY) - rc = kranal_start_listener(); - - up(&kranal_data.kra_nid_mutex); - return rc; -} - -kra_peer_t * -kranal_create_peer (ptl_nid_t nid) -{ - kra_peer_t *peer; - - LASSERT (nid != PTL_NID_ANY); - - PORTAL_ALLOC(peer, sizeof(*peer)); - if (peer == NULL) - return NULL; - - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ - - peer->rap_nid = nid; - atomic_set(&peer->rap_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD(&peer->rap_list); /* not in the peer table yet */ - INIT_LIST_HEAD(&peer->rap_conns); - INIT_LIST_HEAD(&peer->rap_tx_queue); - - peer->rap_reconnect_time = CURRENT_TIME; - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; - - atomic_inc(&kranal_data.kra_npeers); - return peer; -} - -void -__kranal_peer_decref (kra_peer_t *peer) -{ - CDEBUG(D_NET, "peer "LPX64" %p deleted\n", peer->rap_nid, peer); - - LASSERT (atomic_read(&peer->rap_refcount) == 0); - LASSERT (peer->rap_persistence == 0); - LASSERT (!kranal_peer_active(peer)); - LASSERT (peer->rap_connecting == 0); - LASSERT (list_empty(&peer->rap_conns)); - LASSERT (list_empty(&peer->rap_tx_queue)); - - PORTAL_FREE(peer, sizeof(*peer)); - - /* NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. 
*/ - atomic_dec(&kranal_data.kra_npeers); -} - -kra_peer_t * -kranal_find_peer_locked (ptl_nid_t nid) -{ - struct list_head *peer_list = kranal_nid2peerlist(nid); - struct list_head *tmp; - kra_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry(tmp, kra_peer_t, rap_list); - - LASSERT (peer->rap_persistence > 0 || /* persistent peer */ - !list_empty(&peer->rap_conns)); /* active conn */ - - if (peer->rap_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read(&peer->rap_refcount)); - return peer; - } - return NULL; -} - -kra_peer_t * -kranal_find_peer (ptl_nid_t nid) -{ - kra_peer_t *peer; - - read_lock(&kranal_data.kra_global_lock); - peer = kranal_find_peer_locked(nid); - if (peer != NULL) /* +1 ref for caller? */ - kranal_peer_addref(peer); - read_unlock(&kranal_data.kra_global_lock); - - return peer; -} - -void -kranal_unlink_peer_locked (kra_peer_t *peer) -{ - LASSERT (peer->rap_persistence == 0); - LASSERT (list_empty(&peer->rap_conns)); - - LASSERT (kranal_peer_active(peer)); - list_del_init(&peer->rap_list); - - /* lose peerlist's ref */ - kranal_peer_decref(peer); -} - -int -kranal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, - int *persistencep) -{ - kra_peer_t *peer; - struct list_head *ptmp; - int i; - - read_lock(&kranal_data.kra_global_lock); - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) { - - list_for_each(ptmp, &kranal_data.kra_peers[i]) { - - peer = list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !list_empty(&peer->rap_conns)); - - if (index-- > 0) - continue; - - *nidp = peer->rap_nid; - *ipp = peer->rap_ip; - *portp = peer->rap_port; - *persistencep = peer->rap_persistence; - - read_unlock(&kranal_data.kra_global_lock); - return 0; - } - } - - read_unlock(&kranal_data.kra_global_lock); - return -ENOENT; -} - -int -kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port) -{ - unsigned long flags; - kra_peer_t 
*peer; - kra_peer_t *peer2; - - if (nid == PTL_NID_ANY) - return -EINVAL; - - peer = kranal_create_peer(nid); - if (peer == NULL) - return -ENOMEM; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - peer2 = kranal_find_peer_locked(nid); - if (peer2 != NULL) { - kranal_peer_decref(peer); - peer = peer2; - } else { - /* peer table takes existing ref on peer */ - list_add_tail(&peer->rap_list, - kranal_nid2peerlist(nid)); - } - - peer->rap_ip = ip; - peer->rap_port = port; - peer->rap_persistence++; - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - return 0; -} - -void -kranal_del_peer_locked (kra_peer_t *peer, int single_share) -{ - struct list_head *ctmp; - struct list_head *cnxt; - kra_conn_t *conn; - - if (!single_share) - peer->rap_persistence = 0; - else if (peer->rap_persistence > 0) - peer->rap_persistence--; - - if (peer->rap_persistence != 0) - return; - - if (list_empty(&peer->rap_conns)) { - kranal_unlink_peer_locked(peer); - } else { - list_for_each_safe(ctmp, cnxt, &peer->rap_conns) { - conn = list_entry(ctmp, kra_conn_t, rac_list); - - kranal_close_conn_locked(conn, 0); - } - /* peer unlinks itself when last conn is closed */ - } -} - -int -kranal_del_peer (ptl_nid_t nid, int single_share) -{ - unsigned long flags; - struct list_head *ptmp; - struct list_head *pnxt; - kra_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; - else { - lo = 0; - hi = kranal_data.kra_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) { - peer = list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !list_empty(&peer->rap_conns)); - - if (!(nid == PTL_NID_ANY || peer->rap_nid == nid)) - continue; - - kranal_del_peer_locked(peer, single_share); - rc = 0; /* matched something */ - - if 
(single_share) - goto out; - } - } - out: - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - return rc; -} - -kra_conn_t * -kranal_get_conn_by_idx (int index) -{ - kra_peer_t *peer; - struct list_head *ptmp; - kra_conn_t *conn; - struct list_head *ctmp; - int i; - - read_lock (&kranal_data.kra_global_lock); - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) { - list_for_each (ptmp, &kranal_data.kra_peers[i]) { - - peer = list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !list_empty(&peer->rap_conns)); - - list_for_each (ctmp, &peer->rap_conns) { - if (index-- > 0) - continue; - - conn = list_entry(ctmp, kra_conn_t, rac_list); - CDEBUG(D_NET, "++conn[%p] -> "LPX64" (%d)\n", - conn, conn->rac_peer->rap_nid, - atomic_read(&conn->rac_refcount)); - atomic_inc(&conn->rac_refcount); - read_unlock(&kranal_data.kra_global_lock); - return conn; - } - } - } - - read_unlock(&kranal_data.kra_global_lock); - return NULL; -} - -int -kranal_close_peer_conns_locked (kra_peer_t *peer, int why) -{ - kra_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->rap_conns) { - conn = list_entry(ctmp, kra_conn_t, rac_list); - - count++; - kranal_close_conn_locked(conn, why); - } - - return count; -} - -int -kranal_close_stale_conns_locked (kra_peer_t *peer, __u64 incarnation) -{ - kra_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->rap_conns) { - conn = list_entry(ctmp, kra_conn_t, rac_list); - - if (conn->rac_peer_incarnation == incarnation) - continue; - - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->rap_nid, conn->rac_peer_incarnation, incarnation); - LASSERT (conn->rac_peer_incarnation < incarnation); - - count++; - kranal_close_conn_locked(conn, -ESTALE); - } - - return count; -} - -int -kranal_close_matching_conns (ptl_nid_t nid) -{ - 
unsigned long flags; - kra_peer_t *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; - else { - lo = 0; - hi = kranal_data.kra_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &kranal_data.kra_peers[i]) { - - peer = list_entry(ptmp, kra_peer_t, rap_list); - LASSERT (peer->rap_persistence > 0 || - !list_empty(&peer->rap_conns)); - - if (!(nid == PTL_NID_ANY || nid == peer->rap_nid)) - continue; - - count += kranal_close_peer_conns_locked(peer, 0); - } - } - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* wildcards always succeed */ - if (nid == PTL_NID_ANY) - return 0; - - return (count == 0) ? -ENOENT : 0; -} - -int -kranal_cmd(struct portals_cfg *pcfg, void * private) -{ - int rc = -EINVAL; - - LASSERT (pcfg != NULL); - - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - __u32 ip = 0; - int port = 0; - int share_count = 0; - - rc = kranal_get_peer_info(pcfg->pcfg_count, - &nid, &ip, &port, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = kranal_add_persistent_peer(pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_misc); /* port */ - break; - } - case NAL_CMD_DEL_PEER: { - rc = kranal_del_peer(pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); - break; - } - case NAL_CMD_GET_CONN: { - kra_conn_t *conn = kranal_get_conn_by_idx(pcfg->pcfg_count); - - if (conn == NULL) - rc = -ENOENT; - else { - rc = 0; - pcfg->pcfg_nid = conn->rac_peer->rap_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kranal_conn_decref(conn); - } - break; - } - case 
NAL_CMD_CLOSE_CONNECTION: { - rc = kranal_close_matching_conns(pcfg->pcfg_nid); - break; - } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) - rc = -EINVAL; - else - rc = kranal_set_mynid(pcfg->pcfg_nid); - break; - } - } - - return rc; -} - -void -kranal_free_txdescs(struct list_head *freelist) -{ - kra_tx_t *tx; - - while (!list_empty(freelist)) { - tx = list_entry(freelist->next, kra_tx_t, tx_list); - - list_del(&tx->tx_list); - PORTAL_FREE(tx->tx_phys, PTL_MD_MAX_IOV * sizeof(*tx->tx_phys)); - PORTAL_FREE(tx, sizeof(*tx)); - } -} - -int -kranal_alloc_txdescs(struct list_head *freelist, int n) -{ - int isnblk = (freelist == &kranal_data.kra_idle_nblk_txs); - int i; - kra_tx_t *tx; - - LASSERT (freelist == &kranal_data.kra_idle_txs || - freelist == &kranal_data.kra_idle_nblk_txs); - LASSERT (list_empty(freelist)); - - for (i = 0; i < n; i++) { - - PORTAL_ALLOC(tx, sizeof(*tx)); - if (tx == NULL) { - CERROR("Can't allocate %stx[%d]\n", - isnblk ? "nblk " : "", i); - kranal_free_txdescs(freelist); - return -ENOMEM; - } - - PORTAL_ALLOC(tx->tx_phys, - PTL_MD_MAX_IOV * sizeof(*tx->tx_phys)); - if (tx->tx_phys == NULL) { - CERROR("Can't allocate %stx[%d]->tx_phys\n", - isnblk ? 
"nblk " : "", i); - - PORTAL_FREE(tx, sizeof(*tx)); - kranal_free_txdescs(freelist); - return -ENOMEM; - } - - tx->tx_isnblk = isnblk; - tx->tx_buftype = RANAL_BUF_NONE; - - list_add(&tx->tx_list, freelist); - } - - return 0; -} - -int -kranal_device_init(int id, kra_device_t *dev) -{ - const int total_ntx = RANAL_NTX + RANAL_NTX_NBLK; - RAP_RETURN rrc; - - dev->rad_id = id; - rrc = RapkGetDeviceByIndex(id, kranal_device_callback, - &dev->rad_handle); - if (rrc != RAP_SUCCESS) { - CERROR("Can't get Rapidarray Device %d: %d\n", id, rrc); - goto failed_0; - } - - rrc = RapkReserveRdma(dev->rad_handle, total_ntx); - if (rrc != RAP_SUCCESS) { - CERROR("Can't reserve %d RDMA descriptors" - " for device %d: %d\n", total_ntx, id, rrc); - goto failed_1; - } - - rrc = RapkCreatePtag(dev->rad_handle, - &dev->rad_ptag); - if (rrc != RAP_SUCCESS) { - CERROR("Can't create ptag" - " for device %d: %d\n", id, rrc); - goto failed_1; - } - - rrc = RapkCreateCQ(dev->rad_handle, total_ntx, dev->rad_ptag, - &dev->rad_rdma_cq); - if (rrc != RAP_SUCCESS) { - CERROR("Can't create rdma cq size %d" - " for device %d: %d\n", total_ntx, id, rrc); - goto failed_2; - } - - rrc = RapkCreateCQ(dev->rad_handle, RANAL_FMA_CQ_SIZE, - dev->rad_ptag, &dev->rad_fma_cq); - if (rrc != RAP_SUCCESS) { - CERROR("Can't create fma cq size %d" - " for device %d: %d\n", RANAL_FMA_CQ_SIZE, id, rrc); - goto failed_3; - } - - return 0; - - failed_3: - RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cq, dev->rad_ptag); - failed_2: - RapkDestroyPtag(dev->rad_handle, dev->rad_ptag); - failed_1: - RapkReleaseDevice(dev->rad_handle); - failed_0: - return -ENODEV; -} - -void -kranal_device_fini(kra_device_t *dev) -{ - RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cq, dev->rad_ptag); - RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cq, dev->rad_ptag); - RapkDestroyPtag(dev->rad_handle, dev->rad_ptag); - RapkReleaseDevice(dev->rad_handle); -} - -void -kranal_api_shutdown (nal_t *nal) -{ - int i; - int rc; - unsigned long flags; 
- - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); - - LASSERT (nal == &kranal_api); - - switch (kranal_data.kra_init) { - default: - CERROR("Unexpected state %d\n", kranal_data.kra_init); - LBUG(); - - case RANAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(RANAL); - /* No new persistent peers */ - - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kranal_set_mynid(PTL_NID_ANY); - /* no new peers or conns */ - - /* Wait for all peer/conn state to clean up */ - i = 2; - while (atomic_read(&kranal_data.kra_nconns) != 0 || - atomic_read(&kranal_data.kra_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers and %d conns to close down\n", - atomic_read(&kranal_data.kra_npeers), - atomic_read(&kranal_data.kra_nconns)); - kranal_pause(HZ); - } - /* fall through */ - - case RANAL_INIT_LIB: - lib_fini(&kranal_lib); - /* fall through */ - - case RANAL_INIT_DATA: - break; - } - - /* flag threads to terminate; wake and wait for them to die */ - kranal_data.kra_shutdown = 1; - - for (i = 0; i < kranal_data.kra_ndevs; i++) { - kra_device_t *dev = &kranal_data.kra_devices[i]; - - LASSERT (list_empty(&dev->rad_connq)); - - spin_lock_irqsave(&dev->rad_lock, flags); - wake_up(&dev->rad_waitq); - spin_unlock_irqrestore(&dev->rad_lock, flags); - } - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - wake_up_all(&kranal_data.kra_reaper_waitq); - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); - - LASSERT (list_empty(&kranal_data.kra_connd_peers)); - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - wake_up_all(&kranal_data.kra_connd_waitq); - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - i = 2; - while (atomic_read(&kranal_data.kra_nthreads) != 0) { - i++; - CDEBUG(((i & 
(-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "Waiting for %d threads to terminate\n", - atomic_read(&kranal_data.kra_nthreads)); - kranal_pause(HZ); - } - - LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); - if (kranal_data.kra_peers != NULL) { - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) - LASSERT (list_empty(&kranal_data.kra_peers[i])); - - PORTAL_FREE(kranal_data.kra_peers, - sizeof (struct list_head) * - kranal_data.kra_peer_hash_size); - } - - LASSERT (atomic_read(&kranal_data.kra_nconns) == 0); - if (kranal_data.kra_conns != NULL) { - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) - LASSERT (list_empty(&kranal_data.kra_conns[i])); - - PORTAL_FREE(kranal_data.kra_conns, - sizeof (struct list_head) * - kranal_data.kra_conn_hash_size); - } - - for (i = 0; i < kranal_data.kra_ndevs; i++) - kranal_device_fini(&kranal_data.kra_devices[i]); - - kranal_free_txdescs(&kranal_data.kra_idle_txs); - kranal_free_txdescs(&kranal_data.kra_idle_nblk_txs); - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); - printk(KERN_INFO "Lustre: RapidArray NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); - - kranal_data.kra_init = RANAL_INIT_NOTHING; -} - -int -kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - static int device_ids[] = {RAPK_MAIN_DEVICE_ID, - RAPK_EXPANSION_DEVICE_ID}; - struct timeval tv; - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; - kra_device_t *dev; - - LASSERT (nal == &kranal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kranal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return PTL_OK; - } - - LASSERT (kranal_data.kra_init == RANAL_INIT_NOTHING); - - memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */ - - /* CAVEAT EMPTOR: Every 'Fma' message 
includes the sender's NID and - * a unique (for all time) incarnation so we can uniquely identify - * the sender. The incarnation is an incrementing counter - * initialised with seconds + microseconds at startup time. So we - * rely on NOT creating connections more frequently on average than - * 1MHz to ensure we don't use old incarnations when we reboot. */ - do_gettimeofday(&tv); - kranal_data.kra_next_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - init_MUTEX(&kranal_data.kra_nid_mutex); - init_MUTEX_LOCKED(&kranal_data.kra_listener_signal); - - rwlock_init(&kranal_data.kra_global_lock); - - for (i = 0; i < RANAL_MAXDEVS; i++ ) { - kra_device_t *dev = &kranal_data.kra_devices[i]; - - dev->rad_idx = i; - INIT_LIST_HEAD(&dev->rad_connq); - init_waitqueue_head(&dev->rad_waitq); - spin_lock_init(&dev->rad_lock); - } - - init_waitqueue_head(&kranal_data.kra_reaper_waitq); - spin_lock_init(&kranal_data.kra_reaper_lock); - - INIT_LIST_HEAD(&kranal_data.kra_connd_peers); - init_waitqueue_head(&kranal_data.kra_connd_waitq); - spin_lock_init(&kranal_data.kra_connd_lock); - - INIT_LIST_HEAD(&kranal_data.kra_idle_txs); - INIT_LIST_HEAD(&kranal_data.kra_idle_nblk_txs); - init_waitqueue_head(&kranal_data.kra_idle_tx_waitq); - spin_lock_init(&kranal_data.kra_tx_lock); - - /* OK to call kranal_api_shutdown() to cleanup now */ - kranal_data.kra_init = RANAL_INIT_DATA; - - kranal_data.kra_peer_hash_size = RANAL_PEER_HASH_SIZE; - PORTAL_ALLOC(kranal_data.kra_peers, - sizeof(struct list_head) * kranal_data.kra_peer_hash_size); - if (kranal_data.kra_peers == NULL) - goto failed; - - for (i = 0; i < kranal_data.kra_peer_hash_size; i++) - INIT_LIST_HEAD(&kranal_data.kra_peers[i]); - - kranal_data.kra_conn_hash_size = RANAL_PEER_HASH_SIZE; - PORTAL_ALLOC(kranal_data.kra_conns, - sizeof(struct list_head) * kranal_data.kra_conn_hash_size); - if (kranal_data.kra_conns == NULL) - goto failed; - - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) - 
INIT_LIST_HEAD(&kranal_data.kra_conns[i]); - - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, RANAL_NTX); - if (rc != 0) - goto failed; - - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,RANAL_NTX_NBLK); - if (rc != 0) - goto failed; - - process_id.pid = requested_pid; - process_id.nid = PTL_NID_ANY; /* don't know my NID yet */ - - rc = lib_init(&kranal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kranal_data.kra_init = RANAL_INIT_LIB; - /*****************************************************/ - - rc = kranal_thread_start(kranal_reaper, NULL); - if (rc != 0) { - CERROR("Can't spawn ranal reaper: %d\n", rc); - goto failed; - } - - for (i = 0; i < RANAL_N_CONND; i++) { - rc = kranal_thread_start(kranal_connd, (void *)i); - if (rc != 0) { - CERROR("Can't spawn ranal connd[%d]: %d\n", - i, rc); - goto failed; - } - } - - LASSERT(kranal_data.kra_ndevs == 0); - for (i = 0; i < sizeof(device_ids)/sizeof(device_ids[0]); i++) { - dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; - - rc = kranal_device_init(device_ids[i], dev); - if (rc == 0) - kranal_data.kra_ndevs++; - - rc = kranal_thread_start(kranal_scheduler, dev); - if (rc != 0) { - CERROR("Can't spawn ranal scheduler[%d]: %d\n", - i, rc); - goto failed; - } - } - - if (kranal_data.kra_ndevs == 0) - goto failed; - - rc = libcfs_nal_cmd_register(RANAL, &kranal_cmd, NULL); - if (rc != 0) { - CERROR("Can't initialise command interface (rc = %d)\n", rc); - goto failed; - } - - /* flag everything initialised */ - kranal_data.kra_init = RANAL_INIT_ALL; - /*****************************************************/ - - CDEBUG(D_MALLOC, "initial kmem %d\n", atomic_read(&portal_kmemory)); - printk(KERN_INFO "Lustre: RapidArray NAL loaded " - "(initial mem %d)\n", pkmem); - - return PTL_OK; - - failed: - kranal_api_shutdown(&kranal_api); - return PTL_FAIL; -} - -void __exit 
-kranal_module_fini (void) -{ -#ifdef CONFIG_SYSCTL - if (kranal_tunables.kra_sysctl != NULL) - unregister_sysctl_table(kranal_tunables.kra_sysctl); -#endif - PtlNIFini(kranal_ni); - - ptl_unregister_nal(RANAL); -} - -int __init -kranal_module_init (void) -{ - int rc; - - /* the following must be sizeof(int) for - * proc_dointvec/kranal_listener_procint() */ - LASSERT (sizeof(kranal_tunables.kra_timeout) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_listener_timeout) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_backlog) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_port) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_max_immediate) == sizeof(int)); - - kranal_api.nal_ni_init = kranal_api_startup; - kranal_api.nal_ni_fini = kranal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kranal_tunables.kra_timeout = RANAL_TIMEOUT; - - rc = ptl_register_nal(RANAL, &kranal_api); - if (rc != PTL_OK) { - CERROR("Can't register RANAL: %d\n", rc); - return -ENOMEM; /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(RANAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kranal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(RANAL); - return -ENODEV; - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kranal_tunables.kra_sysctl = - register_sysctl_table(kranal_top_ctl_table, 0); -#endif - return 0; -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel RapidArray NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init(kranal_module_init); -module_exit(kranal_module_fini); diff --git a/lustre/portals/knals/ranal/ranal.h b/lustre/portals/knals/ranal/ranal.h deleted file mode 100644 index fe130b7..0000000 --- a/lustre/portals/knals/ranal/ranal.h +++ /dev/null @@ -1,477 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include - -#include - -#if CONFIG_SMP -# define RANAL_N_SCHED num_online_cpus() /* # schedulers */ -#else -# define RANAL_N_SCHED 1 /* # schedulers */ -#endif - -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ - -#define RANAL_N_CONND 4 /* # connection daemons */ - -#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry (seconds)... 
*/ -#define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ - -#define RANAL_FMA_PREFIX_LEN 232 /* size of FMA "Prefix" */ -#define RANAL_FMA_MAX_DATA_LEN ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ - -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ - -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ - * (overflow is a performance hit) */ - -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ -#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ - -/* default vals for runtime tunables */ -#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ -#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ -#define RANAL_MAX_IMMEDIATE (2<<10) /* biggest immediate payload */ - -typedef struct -{ - int kra_timeout; /* comms timeout (seconds) */ - int kra_listener_timeout; /* max time the listener can block */ - int kra_backlog; /* listener's backlog */ - int kra_port; /* listener's TCP/IP port */ - int kra_max_immediate; /* biggest immediate payload */ - struct ctl_table_header *kra_sysctl; /* sysctl interface */ -} kra_tunables_t; - -typedef struct -{ - RAP_PVOID rad_handle; /* device handle */ - RAP_PROTECTION_HANDLE rad_ptag; /* protection tag */ - RAP_CQ_HANDLE rad_fma_cq; /* FMA (small message) completion queue */ - RAP_CQ_HANDLE rad_rdma_cq; /* rdma completion queue */ - int rad_id; /* device id */ - int rad_idx; /* index in kra_devices */ - int rad_ready; /* set by device callback */ - struct list_head rad_connq; /* connections requiring attention */ - wait_queue_head_t rad_waitq; /* scheduler waits here */ - spinlock_t rad_lock; /* serialise */ -} kra_device_t; - -typedef struct -{ - int kra_init; /* initialisation state */ - int kra_shutdown; /* 
shut down? */ - atomic_t kra_nthreads; /* # live threads */ - - struct semaphore kra_nid_mutex; /* serialise NID/listener ops */ - struct semaphore kra_listener_signal; /* block for listener startup/shutdown */ - struct socket *kra_listener_sock; /* listener's socket */ - int kra_listener_shutdown; /* ask listener to close */ - - kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq etc */ - int kra_ndevs; /* # devices */ - - rwlock_t kra_global_lock; /* stabilize peer/conn ops */ - - struct list_head *kra_peers; /* hash table of all my known peers */ - int kra_peer_hash_size; /* size of kra_peers */ - atomic_t kra_npeers; /* # peers extant */ - - struct list_head *kra_conns; /* conns hashed by cqid */ - int kra_conn_hash_size; /* size of kra_conns */ - __u64 kra_next_incarnation; /* conn incarnation # generator */ - int kra_next_cqid; /* cqid generator */ - atomic_t kra_nconns; /* # connections extant */ - - long kra_new_min_timeout; /* minimum timeout on any new conn */ - wait_queue_head_t kra_reaper_waitq; /* reaper sleeps here */ - spinlock_t kra_reaper_lock; /* serialise */ - - struct list_head kra_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kra_connd_waitq; /* connection daemons sleep here */ - spinlock_t kra_connd_lock; /* serialise */ - - struct list_head kra_idle_txs; /* idle tx descriptors */ - struct list_head kra_idle_nblk_txs; /* idle reserved tx descriptors */ - __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - wait_queue_head_t kra_idle_tx_waitq; /* block here for tx descriptor */ - spinlock_t kra_tx_lock; /* serialise */ -} kra_data_t; - -#define RANAL_INIT_NOTHING 0 -#define RANAL_INIT_DATA 1 -#define RANAL_INIT_LIB 2 -#define RANAL_INIT_ALL 3 - -/************************************************************************ - * Wire message structs. These are sent in sender's byte order - * (i.e. receiver checks magic and flips if required). 
- */ - -typedef struct kra_connreq /* connection request/response */ -{ /* (sent via socket) */ - __u32 racr_magic; /* I'm an ranal connreq */ - __u16 racr_version; /* this is my version number */ - __u16 racr_devid; /* which device to connect on */ - __u64 racr_nid; /* my NID */ - __u64 racr_incarnation; /* my incarnation */ - __u32 racr_timeout; /* my timeout */ - RAP_RI_PARAMETERS racr_riparams; /* my endpoint info */ -} kra_connreq_t; - -typedef struct -{ - RAP_MEM_KEY rard_key; - RAP_PVOID64 rard_addr; - RAP_UINT32 rard_nob; -} kra_rdma_desc_t; - -typedef struct -{ - ptl_hdr_t raim_hdr; /* portals header */ - /* Portals payload is in FMA "Message Data" */ -} kra_immediate_msg_t; - -typedef struct -{ - ptl_hdr_t raprm_hdr; /* portals header */ - __u64 raprm_cookie; /* opaque completion cookie */ -} kra_putreq_msg_t; - -typedef struct -{ - __u64 rapam_src_cookie; /* reflected completion cookie */ - __u64 rapam_dst_cookie; /* opaque completion cookie */ - kra_rdma_desc_t rapam_desc; /* sender's sink buffer */ -} kra_putack_msg_t; - -typedef struct -{ - ptl_hdr_t ragm_hdr; /* portals header */ - __u64 ragm_cookie; /* opaque completion cookie */ - kra_rdma_desc_t ragm_desc; /* sender's sink buffer */ -} kra_get_msg_t; - -typedef struct -{ - __u64 racm_cookie; /* reflected completion cookie */ -} kra_completion_msg_t; - -typedef struct /* NB must fit in FMA "Prefix" */ -{ - __u32 ram_magic; /* I'm an ranal message */ - __u16 ram_version; /* this is my version number */ - __u16 ram_type; /* msg type */ - __u64 ram_srcnid; /* sender's NID */ - __u64 ram_incarnation; /* sender's connection incarnation */ - union { - kra_immediate_msg_t immediate; - kra_putreq_msg_t putreq; - kra_putack_msg_t putack; - kra_get_msg_t get; - kra_completion_msg_t completion; - } ram_u; - __u32 ram_seq; /* incrementing sequence number */ -} kra_msg_t; - -#define RANAL_MSG_MAGIC 0x0be91b92 /* unique magic */ -#define RANAL_MSG_VERSION 1 /* current protocol version */ - -#define 
RANAL_MSG_FENCE 0x80 /* fence RDMA */ - -#define RANAL_MSG_NONE 0x00 /* illegal message */ -#define RANAL_MSG_NOOP 0x01 /* empty ram_u (keepalive) */ -#define RANAL_MSG_IMMEDIATE 0x02 /* ram_u.immediate */ -#define RANAL_MSG_PUT_REQ 0x03 /* ram_u.putreq (src->sink) */ -#define RANAL_MSG_PUT_NAK 0x04 /* ram_u.completion (no PUT match: sink->src) */ -#define RANAL_MSG_PUT_ACK 0x05 /* ram_u.putack (PUT matched: sink->src) */ -#define RANAL_MSG_PUT_DONE 0x86 /* ram_u.completion (src->sink) */ -#define RANAL_MSG_GET_REQ 0x07 /* ram_u.get (sink->src) */ -#define RANAL_MSG_GET_NAK 0x08 /* ram_u.completion (no GET match: src->sink) */ -#define RANAL_MSG_GET_DONE 0x89 /* ram_u.completion (src->sink) */ -#define RANAL_MSG_CLOSE 0x8a /* empty ram_u */ - -/***********************************************************************/ - -typedef struct kra_tx /* message descriptor */ -{ - struct list_head tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */ - struct kra_conn *tx_conn; /* owning conn */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - unsigned long tx_qtime; /* when tx started to wait for something */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ - int tx_nob; /* # bytes of payload */ - int tx_buftype; /* payload buffer type */ - void *tx_buffer; /* source/sink buffer */ - int tx_phys_offset; /* first page offset (if phys) */ - int tx_phys_npages; /* # physical pages */ - RAP_PHYS_REGION *tx_phys; /* page descriptors */ - RAP_MEM_KEY tx_map_key; /* mapping key */ - RAP_RDMA_DESCRIPTOR tx_rdma_desc; /* rdma descriptor */ - __u64 tx_cookie; /* identify this tx to peer */ - kra_msg_t tx_msg; /* FMA message buffer */ -} kra_tx_t; - -#define RANAL_BUF_NONE 0 /* buffer type not set */ -#define RANAL_BUF_IMMEDIATE 1 /* immediate data */ -#define RANAL_BUF_PHYS_UNMAPPED 2 /* physical: not mapped yet */ -#define RANAL_BUF_PHYS_MAPPED 3 /* physical: mapped already */ -#define RANAL_BUF_VIRT_UNMAPPED 4 /* virtual: not mapped yet */ -#define 
RANAL_BUF_VIRT_MAPPED 5 /* virtual: mapped already */ - -#define RANAL_TX_IDLE 0x00 /* on freelist */ -#define RANAL_TX_SIMPLE 0x10 /* about to send a simple message */ -#define RANAL_TX_PUTI_REQ 0x20 /* PUT initiator about to send PUT_REQ */ -#define RANAL_TX_PUTI_WAIT_ACK 0x21 /* PUT initiator waiting for PUT_ACK */ -#define RANAL_TX_PUTI_RDMA 0x22 /* PUT initiator waiting for RDMA to complete */ -#define RANAL_TX_PUTI_DONE 0x23 /* PUT initiator about to send PUT_DONE */ -#define RANAL_TX_PUTT_NAK 0x30 /* PUT target about to send PUT_NAK */ -#define RANAL_TX_PUTT_ACK 0x30 /* PUT target about to send PUT_ACK */ -#define RANAL_TX_PUTT_WAIT_DONE 0x31 /* PUT target waiting for PUT_DONE */ -#define RANAL_TX_GETI_REQ 0x40 /* GET initiator about to send GET_REQ */ -#define RANAL_TX_GETI_WAIT_DONE 0x41 /* GET initiator waiting for GET_DONE */ -#define RANAL_TX_GETT_NAK 0x50 /* GET target about to send PUT_NAK */ -#define RANAL_TX_GETT_RDMA 0x51 /* GET target waiting for RDMA to complete */ -#define RANAL_TX_GETT_DONE 0x52 /* GET target about to send GET_DONE */ - -typedef struct kra_conn -{ - struct kra_peer *rac_peer; /* owning peer */ - struct list_head rac_list; /* stash on peer's conn list */ - struct list_head rac_hashlist; /* stash in connection hash table */ - struct list_head rac_schedlist; /* queue for scheduler */ - struct list_head rac_fmaq; /* txs queued for FMA */ - struct list_head rac_rdmaq; /* txs awaiting RDMA completion */ - struct list_head rac_replyq; /* txs awaiting replies */ - __u64 rac_peer_incarnation; /* peer's unique connection stamp */ - __u64 rac_my_incarnation; /* my unique connection stamp */ - unsigned long rac_last_tx; /* when I last sent an FMA message */ - unsigned long rac_last_rx; /* when I last received an FMA messages */ - long rac_keepalive; /* keepalive interval */ - long rac_timeout; /* infer peer death on (last_rx + timout > now) */ - __u32 rac_cqid; /* my completion callback id (non-unique) */ - __u32 rac_tx_seq; /* tx msg 
sequence number */ - __u32 rac_rx_seq; /* rx msg sequence number */ - atomic_t rac_refcount; /* # users */ - unsigned int rac_close_sent; /* I've sent CLOSE */ - unsigned int rac_close_recvd; /* I've received CLOSE */ - unsigned int rac_closing; /* connection being torn down */ - unsigned int rac_scheduled; /* being attented to */ - spinlock_t rac_lock; /* serialise */ - kra_device_t *rac_device; /* which device */ - RAP_PVOID rac_rihandle; /* RA endpoint */ - kra_msg_t *rac_rxmsg; /* incoming message (FMA prefix) */ - kra_msg_t rac_msg; /* keepalive/CLOSE message buffer */ -} kra_conn_t; - -typedef struct kra_peer -{ - struct list_head rap_list; /* stash on global peer list */ - struct list_head rap_connd_list; /* schedule on kra_connd_peers */ - struct list_head rap_conns; /* all active connections */ - struct list_head rap_tx_queue; /* msgs waiting for a conn */ - ptl_nid_t rap_nid; /* who's on the other end(s) */ - __u32 rap_ip; /* IP address of peer */ - int rap_port; /* port on which peer listens */ - atomic_t rap_refcount; /* # users */ - int rap_persistence; /* "known" peer refs */ - int rap_connecting; /* connection forming */ - unsigned long rap_reconnect_time; /* CURRENT_TIME when reconnect OK */ - unsigned long rap_reconnect_interval; /* exponential backoff */ -} kra_peer_t; - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - -extern lib_nal_t kranal_lib; -extern kra_data_t kranal_data; -extern kra_tunables_t kranal_tunables; - -extern void __kranal_peer_decref(kra_peer_t *peer); -extern void __kranal_conn_decref(kra_conn_t *conn); - -static inline void -kranal_peer_addref(kra_peer_t *peer) -{ - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); - 
LASSERT(atomic_read(&peer->rap_refcount) > 0); - atomic_inc(&peer->rap_refcount); -} - -static inline void -kranal_peer_decref(kra_peer_t *peer) -{ - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); - LASSERT(atomic_read(&peer->rap_refcount) > 0); - if (atomic_dec_and_test(&peer->rap_refcount)) - __kranal_peer_decref(peer); -} - -static inline struct list_head * -kranal_nid2peerlist (ptl_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size; - - return (&kranal_data.kra_peers [hash]); -} - -static inline int -kranal_peer_active(kra_peer_t *peer) -{ - /* Am I in the peer hash table? */ - return (!list_empty(&peer->rap_list)); -} - -static inline void -kranal_conn_addref(kra_conn_t *conn) -{ - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); - LASSERT(atomic_read(&conn->rac_refcount) > 0); - atomic_inc(&conn->rac_refcount); -} - -static inline void -kranal_conn_decref(kra_conn_t *conn) -{ - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); - LASSERT(atomic_read(&conn->rac_refcount) > 0); - if (atomic_dec_and_test(&conn->rac_refcount)) - __kranal_conn_decref(conn); -} - -static inline struct list_head * -kranal_cqid2connlist (__u32 cqid) -{ - unsigned int hash = cqid % kranal_data.kra_conn_hash_size; - - return (&kranal_data.kra_conns [hash]); -} - -static inline kra_conn_t * -kranal_cqid2conn_locked (__u32 cqid) -{ - struct list_head *conns = kranal_cqid2connlist(cqid); - struct list_head *tmp; - kra_conn_t *conn; - - list_for_each(tmp, conns) { - conn = list_entry(tmp, kra_conn_t, rac_hashlist); - - if (conn->rac_cqid == cqid) - return conn; - } - - return NULL; -} - -static inline int -kranal_tx_mapped (kra_tx_t *tx) -{ - return (tx->tx_buftype == RANAL_BUF_VIRT_MAPPED || - tx->tx_buftype == RANAL_BUF_PHYS_MAPPED); -} - -#if CONFIG_X86 -static inline __u64 -kranal_page2phys (struct page *p) -{ - __u64 page_number = p - mem_map; - - return (page_number << PAGE_SHIFT); -} -#else -# error "no 
page->phys" -#endif - -extern int kranal_listener_procint(ctl_table *table, - int write, struct file *filp, - void *buffer, size_t *lenp); -extern int kranal_close_stale_conns_locked (kra_peer_t *peer, - __u64 incarnation); -extern void kranal_update_reaper_timeout(long timeout); -extern void kranal_tx_done (kra_tx_t *tx, int completion); -extern void kranal_unlink_peer_locked (kra_peer_t *peer); -extern void kranal_schedule_conn(kra_conn_t *conn); -extern kra_peer_t *kranal_create_peer (ptl_nid_t nid); -extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid); -extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx); -extern int kranal_del_peer (ptl_nid_t nid, int single_share); -extern void kranal_device_callback(RAP_INT32 devid); -extern int kranal_thread_start (int(*fn)(void *arg), void *arg); -extern int kranal_connd (void *arg); -extern int kranal_reaper (void *arg); -extern int kranal_scheduler (void *arg); -extern void kranal_close_conn_locked (kra_conn_t *conn, int error); -extern void kranal_terminate_conn_locked (kra_conn_t *conn); -extern void kranal_connect (kra_peer_t *peer); diff --git a/lustre/portals/knals/ranal/ranal_cb.c b/lustre/portals/knals/ranal/ranal_cb.c deleted file mode 100644 index 9490b56..0000000 --- a/lustre/portals/knals/ranal/ranal_cb.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "ranal.h" - -int -kranal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kranal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -void -kranal_device_callback(RAP_INT32 devid) -{ - kra_device_t *dev; - int i; - unsigned long flags; - - for (i = 0; i < kranal_data.kra_ndevs; i++) { - - dev = &kranal_data.kra_devices[i]; - if (dev->rad_id != devid) - continue; - - spin_lock_irqsave(&dev->rad_lock, flags); - - if (!dev->rad_ready) { - dev->rad_ready = 1; - wake_up(&dev->rad_waitq); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); - return; - } - - CWARN("callback for unknown device %d\n", devid); -} - -void -kranal_schedule_conn(kra_conn_t *conn) -{ - kra_device_t *dev = conn->rac_device; - unsigned long flags; - - spin_lock_irqsave(&dev->rad_lock, flags); - - if (!conn->rac_scheduled) { - kranal_conn_addref(conn); /* +1 ref for scheduler */ - conn->rac_scheduled = 1; - list_add_tail(&conn->rac_schedlist, &dev->rad_connq); - wake_up(&dev->rad_waitq); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); -} - -void -kranal_schedule_cqid (__u32 cqid) -{ - kra_conn_t *conn; - struct list_head *conns; - struct list_head *tmp; - - conns = kranal_cqid2connlist(cqid); - - read_lock(&kranal_data.kra_global_lock); - - conn = kranal_cqid2conn_locked(cqid); - - if (conn == NULL) - CWARN("no cqid %x\n", cqid); - else - kranal_schedule_conn(conn); - - read_unlock(&kranal_data.kra_global_lock); -} - -void -kranal_schedule_dev(kra_device_t *dev) -{ - kra_conn_t *conn; - struct list_head *conns; - struct list_head *tmp; - int i; - - /* 
Don't do this in IRQ context (servers may have 1000s of clients) */ - LASSERT (!in_interrupt()); - - CWARN("Scheduling ALL conns on device %d\n", dev->rad_id); - - for (i = 0; i < kranal_data.kra_conn_hash_size; i++) { - - /* Drop the lock on each hash bucket to ensure we don't - * block anyone for too long at IRQ priority on another CPU */ - - read_lock(&kranal_data.kra_global_lock); - - conns = &kranal_data.kra_conns[i]; - - list_for_each (tmp, conns) { - conn = list_entry(tmp, kra_conn_t, rac_hashlist); - - if (conn->rac_device == dev) - kranal_schedule_conn(conn); - } - read_unlock(&kranal_data.kra_global_lock); - } -} - -void -kranal_tx_done (kra_tx_t *tx, int completion) -{ - ptl_err_t ptlrc = (completion == 0) ? PTL_OK : PTL_FAIL; - kra_device_t *dev; - unsigned long flags; - int i; - RAP_RETURN rrc; - - LASSERT (!in_interrupt()); - - switch (tx->tx_buftype) { - default: - LBUG(); - - case RANAL_BUF_NONE: - case RANAL_BUF_IMMEDIATE: - case RANAL_BUF_PHYS_UNMAPPED: - case RANAL_BUF_VIRT_UNMAPPED: - break; - - case RANAL_BUF_PHYS_MAPPED: - LASSERT (tx->tx_conn != NULL); - dev = tx->tx_conn->rac_device; - rrc = RapkDeregisterMemory(dev->rad_handle, NULL, - dev->rad_ptag, &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - break; - - case RANAL_BUF_VIRT_MAPPED: - LASSERT (tx->tx_conn != NULL); - dev = tx->tx_conn->rac_device; - rrc = RapkDeregisterMemory(dev->rad_handle, tx->tx_buffer, - dev->rad_ptag, &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - break; - } - - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; - - lib_finalize(&kranal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } - - tx->tx_buftype = RANAL_BUF_NONE; - tx->tx_msg.ram_type = RANAL_MSG_NONE; - tx->tx_conn = NULL; - - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - - if (tx->tx_isnblk) { - list_add_tail(&tx->tx_list, &kranal_data.kra_idle_nblk_txs); - } else { - list_add_tail(&tx->tx_list, 
&kranal_data.kra_idle_txs); - wake_up(&kranal_data.kra_idle_tx_waitq); - } - - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); -} - -kra_tx_t * -kranal_get_idle_tx (int may_block) -{ - unsigned long flags; - kra_tx_t *tx = NULL; - - for (;;) { - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty(&kranal_data.kra_idle_txs)) { - tx = list_entry(kranal_data.kra_idle_txs.next, - kra_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty(&kranal_data.kra_idle_nblk_txs)) { - CERROR("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry(kranal_data.kra_idle_nblk_txs.next, - kra_tx_t, tx_list); - break; - } - - /* block for idle tx */ - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - wait_event(kranal_data.kra_idle_tx_waitq, - !list_empty(&kranal_data.kra_idle_txs)); - } - - if (tx != NULL) { - list_del(&tx->tx_list); - - /* Allocate a new completion cookie. It might not be - * needed, but we've got a lock right now... 
*/ - tx->tx_cookie = kranal_data.kra_next_tx_cookie++; - - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } - - spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - return tx; -} - -void -kranal_init_msg(kra_msg_t *msg, int type) -{ - msg->ram_magic = RANAL_MSG_MAGIC; - msg->ram_version = RANAL_MSG_VERSION; - msg->ram_type = type; - msg->ram_srcnid = kranal_lib.libnal_ni.ni_pid.nid; - /* ram_incarnation gets set when FMA is sent */ -} - -kra_tx_t * -kranal_new_tx_msg (int may_block, int type) -{ - kra_tx_t *tx = kranal_get_idle_tx(may_block); - - if (tx == NULL) - return NULL; - - kranal_init_msg(&tx->tx_msg, type); - return tx; -} - -int -kranal_setup_immediate_buffer (kra_tx_t *tx, int niov, struct iovec *iov, - int offset, int nob) - -{ - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR("Can't handle multiple vaddr fragments\n"); - return -EMSGSIZE; - } - - tx->tx_buftype = RANAL_BUF_IMMEDIATE; - tx->tx_nob = nob; - tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); - return 0; -} - -int -kranal_setup_virt_buffer (kra_tx_t *tx, int niov, struct iovec *iov, - int offset, int nob) - -{ - LASSERT (nob > 0); - LASSERT (niov > 0); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT (niov > 0); - } - - if (nob > iov->iov_len - offset) { - CERROR("Can't handle multiple vaddr fragments\n"); - return -EMSGSIZE; - } - - tx->tx_buftype = RANAL_BUF_VIRT_UNMAPPED; - tx->tx_nob = nob; - tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset); - return 0; -} - -int -kranal_setup_phys_buffer (kra_tx_t *tx, int 
nkiov, ptl_kiov_t *kiov, - int offset, int nob) -{ - RAP_PHYS_REGION *phys = tx->tx_phys; - int resid; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT (nob > 0); - LASSERT (nkiov > 0); - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - nkiov--; - kiov++; - LASSERT (nkiov > 0); - } - - tx->tx_buftype = RANAL_BUF_PHYS_UNMAPPED; - tx->tx_nob = nob; - tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset)); - - phys->Address = kranal_page2phys(kiov->kiov_page); - phys->Length = PAGE_SIZE; - phys++; - - resid = nob - (kiov->kiov_len - offset); - while (resid > 0) { - kiov++; - nkiov--; - LASSERT (nkiov > 0); - - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - int i; - /* Can't have gaps */ - CERROR("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", - phys - tx->tx_phys, - kiov->kiov_offset, kiov->kiov_len); - return -EINVAL; - } - - if ((phys - tx->tx_phys) == PTL_MD_MAX_IOV) { - CERROR ("payload too big (%d)\n", phys - tx->tx_phys); - return -EMSGSIZE; - } - - phys->Address = kranal_page2phys(kiov->kiov_page); - phys->Length = PAGE_SIZE; - phys++; - - resid -= PAGE_SIZE; - } - - tx->tx_phys_npages = phys - tx->tx_phys; - return 0; -} - -static inline int -kranal_setup_buffer (kra_tx_t *tx, int niov, - struct iovec *iov, ptl_kiov_t *kiov, - int offset, int nob) -{ - LASSERT ((iov == NULL) != (kiov == NULL)); - - if (kiov != NULL) - return kranal_setup_phys_buffer(tx, niov, kiov, offset, nob); - - return kranal_setup_virt_buffer(tx, niov, iov, offset, nob); -} - -void -kranal_map_buffer (kra_tx_t *tx) -{ - kra_conn_t *conn = tx->tx_conn; - kra_device_t *dev = conn->rac_device; - RAP_RETURN rrc; - - switch (tx->tx_buftype) { - default: - - case RANAL_BUF_PHYS_UNMAPPED: - rrc = RapkRegisterPhys(conn->rac_device->rad_handle, - tx->tx_phys, tx->tx_phys_npages, - conn->rac_device->rad_ptag, - 
&tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - tx->tx_buftype = RANAL_BUF_PHYS_MAPPED; - return; - - case RANAL_BUF_VIRT_UNMAPPED: - rrc = RapkRegisterMemory(conn->rac_device->rad_handle, - tx->tx_buffer, tx->tx_nob, - conn->rac_device->rad_ptag, - &tx->tx_map_key); - LASSERT (rrc == RAP_SUCCESS); - tx->tx_buftype = RANAL_BUF_VIRT_MAPPED; - return; - } -} - -kra_conn_t * -kranal_find_conn_locked (kra_peer_t *peer) -{ - struct list_head *tmp; - - /* just return the first connection */ - list_for_each (tmp, &peer->rap_conns) { - return list_entry(tmp, kra_conn_t, rac_list); - } - - return NULL; -} - -void -kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx) -{ - unsigned long flags; - - tx->tx_conn = conn; - - spin_lock_irqsave(&conn->rac_lock, flags); - list_add_tail(&tx->tx_list, &conn->rac_fmaq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); - - kranal_schedule_conn(conn); -} - -void -kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) -{ - unsigned long flags; - kra_peer_t *peer; - kra_conn_t *conn; - unsigned long now; - rwlock_t *g_lock = &kranal_data.kra_global_lock; - - /* If I get here, I've committed to send, so I complete the tx with - * failure on any problems */ - - LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - - read_lock(g_lock); - - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { - read_unlock(g_lock); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } - - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - kranal_post_fma(conn, tx); - read_unlock(g_lock); - return; - } - - /* Making one or more connections; I'll need a write lock... 
*/ - read_unlock(g_lock); - write_lock_irqsave(g_lock, flags); - - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { - write_unlock_irqrestore(g_lock, flags); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } - - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - kranal_post_fma(conn, tx); - write_unlock_irqrestore(g_lock, flags); - return; - } - - LASSERT (peer->rap_persistence > 0); - - if (!peer->rap_connecting) { - now = CURRENT_TIME; - if (now < peer->rap_reconnect_time) { - write_unlock_irqrestore(g_lock, flags); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } - - peer->rap_connecting = 1; - kranal_peer_addref(peer); /* extra ref for connd */ - - spin_lock(&kranal_data.kra_connd_lock); - - list_add_tail(&peer->rap_connd_list, - &kranal_data.kra_connd_peers); - wake_up(&kranal_data.kra_connd_waitq); - - spin_unlock(&kranal_data.kra_connd_lock); - } - - /* A connection is being established; queue the message... */ - list_add_tail(&tx->tx_list, &peer->rap_tx_queue); - - write_unlock_irqrestore(g_lock, flags); -} - -static void -kranal_rdma(kra_tx_t *tx, int type, - kra_rdma_desc_t *rard, int nob, __u64 cookie) -{ - kra_conn_t *conn = tx->tx_conn; - RAP_RETURN rrc; - unsigned long flags; - - /* prep final completion message */ - kranal_init_msg(&tx->tx_msg, type); - tx->tx_msg.ram_u.completion.racm_cookie = cookie; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - LASSERT (nob <= rard->rard_nob); - - memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc)); - tx->tx_rdma_desc.SrcPtr.AddressBits = (__u64)((unsigned long)tx->tx_buffer); - tx->tx_rdma_desc.SrcKey = tx->tx_map_key; - tx->tx_rdma_desc.DstPtr = rard->rard_addr; - tx->tx_rdma_desc.DstKey = rard->rard_key; - tx->tx_rdma_desc.Length = nob; - tx->tx_rdma_desc.AppPtr = tx; - - if (nob == 0) { /* Immediate completion */ - kranal_post_fma(conn, tx); - return; - } - - rrc = 
RapkPostRdma(conn->rac_rihandle, &tx->tx_rdma_desc); - LASSERT (rrc == RAP_SUCCESS); - - spin_lock_irqsave(&conn->rac_lock, flags); - list_add_tail(&tx->tx_list, &conn->rac_rdmaq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); -} - -int -kranal_consume_rxmsg (kra_conn_t *conn, void *buffer, int nob) -{ - __u32 nob_received = nob; - RAP_RETURN rrc; - - LASSERT (conn->rac_rxmsg != NULL); - - rrc = RapkFmaCopyToUser(conn->rac_rihandle, buffer, - &nob_received, sizeof(kra_msg_t)); - LASSERT (rrc == RAP_SUCCESS); - - conn->rac_rxmsg = NULL; - - if (nob_received != nob) { - CWARN("Expected %d immediate bytes but got %d\n", - nob, nob_received); - return -EPROTO; - } - - return 0; -} - -ptl_err_t -kranal_do_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - ptl_kiov_t *kiov, - size_t offset, - size_t nob) -{ - kra_conn_t *conn; - kra_tx_t *tx; - int rc; - - /* NB 'private' is different depending on what we're sending.... */ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", nob, niov, nid , pid); - - LASSERT (nob == 0 || niov > 0); - LASSERT (niov <= PTL_MD_MAX_IOV); - - LASSERT (!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch(type) { - default: - LBUG(); - - case PTL_MSG_REPLY: { - /* reply's 'private' is the conn that received the GET_REQ */ - conn = private; - LASSERT (conn->rac_rxmsg != NULL); - - if (conn->rac_rxmsg->ram_type == RANAL_MSG_IMMEDIATE) { - if (nob > RANAL_MAX_IMMEDIATE) { - CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n", - nob, nid); - return PTL_FAIL; - } - break; /* RDMA not expected */ - } - - /* Incoming message consistent with immediate reply? 
*/ - if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) { - CERROR("REPLY to "LPX64" bad msg type %x!!!\n", - nid, conn->rac_rxmsg->ram_type); - return PTL_FAIL; - } - - tx = kranal_get_idle_tx(0); - if (tx == NULL) - return PTL_FAIL; - - rc = kranal_setup_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_conn = conn; - tx->tx_libmsg[0] = libmsg; - - kranal_map_buffer(tx); - kranal_rdma(tx, RANAL_MSG_GET_DONE, - &conn->rac_rxmsg->ram_u.get.ragm_desc, nob, - conn->rac_rxmsg->ram_u.get.ragm_cookie); - return PTL_OK; - } - - case PTL_MSG_GET: - if (kiov == NULL && /* not paged */ - nob <= RANAL_MAX_IMMEDIATE && /* small enough */ - nob <= kranal_tunables.kra_max_immediate) - break; /* send IMMEDIATE */ - - tx = kranal_new_tx_msg(0, RANAL_MSG_GET_REQ); - if (tx == NULL) - return PTL_NO_SPACE; - - rc = kranal_setup_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_libmsg[1] = lib_create_reply_msg(&kranal_lib, nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR("Can't create reply for GET to "LPX64"\n", nid); - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_libmsg[0] = libmsg; - tx->tx_msg.ram_u.get.ragm_hdr = *hdr; - /* rest of tx_msg is setup just before it is sent */ - kranal_launch_tx(tx, nid); - return PTL_OK; - - case PTL_MSG_ACK: - LASSERT (nob == 0); - break; - - case PTL_MSG_PUT: - if (kiov == NULL && /* not paged */ - nob <= RANAL_MAX_IMMEDIATE && /* small enough */ - nob <= kranal_tunables.kra_max_immediate) - break; /* send IMMEDIATE */ - - tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ); - if (tx == NULL) - return PTL_NO_SPACE; - - rc = kranal_setup_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_libmsg[0] = libmsg; - tx->tx_msg.ram_u.putreq.raprm_hdr = *hdr; - /* rest of tx_msg is setup just before it is sent */ - 
kranal_launch_tx(tx, nid); - return PTL_OK; - } - - LASSERT (kiov == NULL); - LASSERT (nob <= RANAL_MAX_IMMEDIATE); - - tx = kranal_new_tx_msg(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt()), - RANAL_MSG_IMMEDIATE); - if (tx == NULL) - return PTL_NO_SPACE; - - rc = kranal_setup_immediate_buffer(tx, niov, iov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_msg.ram_u.immediate.raim_hdr = *hdr; - tx->tx_libmsg[0] = libmsg; - kranal_launch_tx(tx, nid); - return PTL_OK; -} - -ptl_err_t -kranal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t len) -{ - return kranal_do_send(nal, private, cookie, - hdr, type, nid, pid, - niov, iov, NULL, - offset, len); -} - -ptl_err_t -kranal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t len) -{ - return kranal_do_send(nal, private, cookie, - hdr, type, nid, pid, - niov, NULL, kiov, - offset, len); -} - -ptl_err_t -kranal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - kra_conn_t *conn = private; - kra_msg_t *rxmsg = conn->rac_rxmsg; - kra_tx_t *tx; - void *buffer; - int rc; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); - - switch(rxmsg->ram_type) { - default: - LBUG(); - return PTL_FAIL; - - case RANAL_MSG_IMMEDIATE: - if (mlen == 0) { - buffer = NULL; - } else if (kiov != NULL) { - CERROR("Can't recv immediate into paged buffer\n"); - return PTL_FAIL; - } else { - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - if (mlen > 
iov->iov_len - offset) { - CERROR("Can't handle immediate frags\n"); - return PTL_FAIL; - } - buffer = ((char *)iov->iov_base) + offset; - } - rc = kranal_consume_rxmsg(conn, buffer, mlen); - lib_finalize(nal, NULL, libmsg, (rc == 0) ? PTL_OK : PTL_FAIL); - return PTL_OK; - - case RANAL_MSG_GET_REQ: - /* If the GET matched, we've already handled it in - * kranal_do_send which is called to send the REPLY. We're - * only called here to complete the GET receive (if we needed - * it which we don't, but I digress...) */ - LASSERT (libmsg == NULL); - lib_finalize(nal, NULL, libmsg, PTL_OK); - return PTL_OK; - - case RANAL_MSG_PUT_REQ: - if (libmsg == NULL) { /* PUT didn't match... */ - lib_finalize(nal, NULL, libmsg, PTL_OK); - return PTL_OK; - } - - tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_ACK); - if (tx == NULL) - return PTL_NO_SPACE; - - rc = kranal_setup_buffer(tx, niov, iov, kiov, offset, mlen); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - kranal_map_buffer(tx); - - tx->tx_msg.ram_u.putack.rapam_src_cookie = - conn->rac_rxmsg->ram_u.putreq.raprm_cookie; - tx->tx_msg.ram_u.putack.rapam_dst_cookie = tx->tx_cookie; - tx->tx_msg.ram_u.putack.rapam_desc.rard_key = tx->tx_map_key; - tx->tx_msg.ram_u.putack.rapam_desc.rard_addr.AddressBits = - (__u64)((unsigned long)tx->tx_buffer); - tx->tx_msg.ram_u.putack.rapam_desc.rard_nob = mlen; - - tx->tx_libmsg[0] = libmsg; /* finalize this on RDMA_DONE */ - - kranal_post_fma(conn, tx); - - /* flag matched by consuming rx message */ - kranal_consume_rxmsg(conn, NULL, 0); - return PTL_OK; - } -} - -ptl_err_t -kranal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return kranal_recvmsg(nal, private, msg, niov, iov, NULL, - offset, mlen, rlen); -} - -ptl_err_t -kranal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return 
kranal_recvmsg(nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen); -} - -int -kranal_thread_start (int(*fn)(void *arg), void *arg) -{ - long pid = kernel_thread(fn, arg, 0); - - if (pid < 0) - return(int)pid; - - atomic_inc(&kranal_data.kra_nthreads); - return 0; -} - -void -kranal_thread_fini (void) -{ - atomic_dec(&kranal_data.kra_nthreads); -} - -int -kranal_check_conn (kra_conn_t *conn) -{ - kra_tx_t *tx; - struct list_head *ttmp; - unsigned long flags; - long timeout; - unsigned long now = jiffies; - - if (!conn->rac_closing && - time_after_eq(now, conn->rac_last_tx + conn->rac_keepalive * HZ)) { - /* not sent in a while; schedule conn so scheduler sends a keepalive */ - kranal_schedule_conn(conn); - } - - /* wait twice as long for CLOSE to be sure peer is dead */ - timeout = (conn->rac_closing ? 1 : 2) * conn->rac_timeout * HZ; - - if (!conn->rac_close_recvd && - time_after_eq(now, conn->rac_last_rx + timeout)) { - CERROR("Nothing received from "LPX64" within %lu seconds\n", - conn->rac_peer->rap_nid, (now - conn->rac_last_rx)/HZ); - return -ETIMEDOUT; - } - - if (conn->rac_closing) - return 0; - - /* Check the conn's queues are moving. These are "belt+braces" checks, - * in case of hardware/software errors that make this conn seem - * responsive even though it isn't progressing its message queues. 
*/ - - spin_lock_irqsave(&conn->rac_lock, flags); - - list_for_each (ttmp, &conn->rac_fmaq) { - tx = list_entry(ttmp, kra_tx_t, tx_list); - - if (time_after_eq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on fmaq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); - return -ETIMEDOUT; - } - } - - list_for_each (ttmp, &conn->rac_rdmaq) { - tx = list_entry(ttmp, kra_tx_t, tx_list); - - if (time_after_eq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on rdmaq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); - return -ETIMEDOUT; - } - } - - list_for_each (ttmp, &conn->rac_replyq) { - tx = list_entry(ttmp, kra_tx_t, tx_list); - - if (time_after_eq(now, tx->tx_qtime + timeout)) { - spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on replyq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); - return -ETIMEDOUT; - } - } - - spin_unlock_irqrestore(&conn->rac_lock, flags); - return 0; -} - -void -kranal_check_conns (int idx, unsigned long *min_timeoutp) -{ - struct list_head *conns = &kranal_data.kra_conns[idx]; - struct list_head *ctmp; - kra_conn_t *conn; - unsigned long flags; - int rc; - - again: - /* NB. We expect to check all the conns and not find any problems, so - * we just use a shared lock while we take a look... 
*/ - read_lock(&kranal_data.kra_global_lock); - - list_for_each (ctmp, conns) { - conn = list_entry(ctmp, kra_conn_t, rac_hashlist); - - if (conn->rac_timeout < *min_timeoutp ) - *min_timeoutp = conn->rac_timeout; - if (conn->rac_keepalive < *min_timeoutp ) - *min_timeoutp = conn->rac_keepalive; - - rc = kranal_check_conn(conn); - if (rc == 0) - continue; - - kranal_conn_addref(conn); - read_unlock(&kranal_data.kra_global_lock); - - CERROR("Check on conn to "LPX64"failed: %d\n", - conn->rac_peer->rap_nid, rc); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (!conn->rac_closing) - kranal_close_conn_locked(conn, -ETIMEDOUT); - else - kranal_terminate_conn_locked(conn); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - kranal_conn_decref(conn); - - /* start again now I've dropped the lock */ - goto again; - } - - read_unlock(&kranal_data.kra_global_lock); -} - -int -kranal_connd (void *arg) -{ - char name[16]; - wait_queue_t wait; - unsigned long flags; - kra_peer_t *peer; - int i; - - snprintf(name, sizeof(name), "kranal_connd_%02ld", (long)arg); - kportal_daemonize(name); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - while (!kranal_data.kra_shutdown) { - /* Safe: kra_shutdown only set when quiescent */ - - if (!list_empty(&kranal_data.kra_connd_peers)) { - peer = list_entry(kranal_data.kra_connd_peers.next, - kra_peer_t, rap_connd_list); - - list_del_init(&peer->rap_connd_list); - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - kranal_connect(peer); - kranal_peer_decref(peer); - - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - continue; - } - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kranal_data.kra_connd_waitq, &wait); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - schedule (); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&kranal_data.kra_connd_waitq, &wait); - - 
spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - } - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - kranal_thread_fini(); - return 0; -} - -void -kranal_update_reaper_timeout(long timeout) -{ - unsigned long flags; - - LASSERT (timeout > 0); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - if (timeout < kranal_data.kra_new_min_timeout) - kranal_data.kra_new_min_timeout = timeout; - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); -} - -int -kranal_reaper (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - kra_conn_t *conn; - kra_peer_t *peer; - long timeout; - int i; - int conn_entries = kranal_data.kra_conn_hash_size; - int conn_index = 0; - int base_index = conn_entries - 1; - unsigned long next_check_time = jiffies; - long next_min_timeout = MAX_SCHEDULE_TIMEOUT; - long current_min_timeout = 1; - - kportal_daemonize("kranal_reaper"); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - kranal_data.kra_new_min_timeout = 1; - - while (!kranal_data.kra_shutdown) { - - /* careful with the jiffy wrap... */ - timeout = (long)(next_check_time - jiffies); - if (timeout <= 0) { - - /* I wake up every 'p' seconds to check for - * timeouts on some more peers. I try to check - * every connection 'n' times within the global - * minimum of all keepalive and timeout intervals, - * to ensure I attend to every connection within - * (n+1)/n times its timeout intervals. 
*/ - - const int p = 1; - const int n = 3; - unsigned long min_timeout; - int chunk; - - if (kranal_data.kra_new_min_timeout != MAX_SCHEDULE_TIMEOUT) { - /* new min timeout set: restart min timeout scan */ - next_min_timeout = MAX_SCHEDULE_TIMEOUT; - base_index = conn_index - 1; - if (base_index < 0) - base_index = conn_entries - 1; - - if (kranal_data.kra_new_min_timeout < current_min_timeout) { - current_min_timeout = kranal_data.kra_new_min_timeout; - CWARN("Set new min timeout %ld\n", - current_min_timeout); - } - - kranal_data.kra_new_min_timeout = MAX_SCHEDULE_TIMEOUT; - } - min_timeout = current_min_timeout; - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, - flags); - - LASSERT (min_timeout > 0); - - /* Compute how many table entries to check now so I - * get round the whole table fast enough (NB I do - * this at fixed intervals of 'p' seconds) */ - chunk = conn_entries; - if (min_timeout > n * p) - chunk = (chunk * n * p) / min_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - kranal_check_conns(conn_index, - &next_min_timeout); - conn_index = (conn_index + 1) % conn_entries; - } - - next_check_time += p * HZ; - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - if (((conn_index - chunk <= base_index && - base_index < conn_index) || - (conn_index - conn_entries - chunk <= base_index && - base_index < conn_index - conn_entries))) { - - /* Scanned all conns: set current_min_timeout... 
*/ - if (current_min_timeout != next_min_timeout) { - current_min_timeout = next_min_timeout; - CWARN("Set new min timeout %ld\n", - current_min_timeout); - } - - /* ...and restart min timeout scan */ - next_min_timeout = MAX_SCHEDULE_TIMEOUT; - base_index = conn_index - 1; - if (base_index < 0) - base_index = conn_entries - 1; - } - } - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kranal_data.kra_reaper_waitq, &wait); - - spin_unlock_irqrestore(&kranal_data.kra_reaper_lock, flags); - - schedule_timeout(timeout); - - spin_lock_irqsave(&kranal_data.kra_reaper_lock, flags); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&kranal_data.kra_reaper_waitq, &wait); - } - - kranal_thread_fini(); - return 0; -} - -void -kranal_process_rdmaq (__u32 cqid) -{ - kra_conn_t *conn; - kra_tx_t *tx; - RAP_RETURN rrc; - unsigned long flags; - RAP_RDMA_DESCRIPTOR *desc; - - read_lock(&kranal_data.kra_global_lock); - - conn = kranal_cqid2conn_locked(cqid); - LASSERT (conn != NULL); - - rrc = RapkRdmaDone(conn->rac_rihandle, &desc); - LASSERT (rrc == RAP_SUCCESS); - - spin_lock_irqsave(&conn->rac_lock, flags); - - LASSERT (!list_empty(&conn->rac_rdmaq)); - tx = list_entry(conn->rac_rdmaq.next, kra_tx_t, tx_list); - list_del(&tx->tx_list); - - LASSERT(desc->AppPtr == (void *)tx); - LASSERT(tx->tx_msg.ram_type == RANAL_MSG_PUT_DONE || - tx->tx_msg.ram_type == RANAL_MSG_GET_DONE); - - list_add_tail(&tx->tx_list, &conn->rac_fmaq); - tx->tx_qtime = jiffies; - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - /* Get conn's fmaq processed, now I've just put something there */ - kranal_schedule_conn(conn); - - read_unlock(&kranal_data.kra_global_lock); -} - -int -kranal_sendmsg(kra_conn_t *conn, kra_msg_t *msg, - void *immediate, int immediatenob) -{ - int sync = (msg->ram_type & RANAL_MSG_FENCE) != 0; - RAP_RETURN rrc; - - LASSERT (sizeof(*msg) <= RANAL_FMA_PREFIX_LEN); - LASSERT ((msg->ram_type == RANAL_MSG_IMMEDIATE) ? 
- immediatenob <= RANAL_FMA_MAX_DATA_LEN : - immediatenob == 0); - - msg->ram_incarnation = conn->rac_my_incarnation; - msg->ram_seq = conn->rac_tx_seq; - - if (sync) - rrc = RapkFmaSyncSend(conn->rac_device->rad_handle, - immediate, immediatenob, - msg, sizeof(*msg)); - else - rrc = RapkFmaSend(conn->rac_device->rad_handle, - immediate, immediatenob, - msg, sizeof(*msg)); - - switch (rrc) { - default: - LBUG(); - - case RAP_SUCCESS: - conn->rac_last_tx = jiffies; - conn->rac_tx_seq++; - return 0; - - case RAP_NOT_DONE: - return -EAGAIN; - } -} - -int -kranal_process_fmaq (kra_conn_t *conn) -{ - unsigned long flags; - int more_to_do; - kra_tx_t *tx; - int rc; - int expect_reply; - - /* NB I will be rescheduled some via a rad_fma_cq event if my FMA is - * out of credits when I try to send right now... */ - - if (conn->rac_closing) { - - if (!list_empty(&conn->rac_rdmaq)) { - /* Can't send CLOSE yet; I'm still waiting for RDMAs I - * posted to finish */ - LASSERT (!conn->rac_close_sent); - kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); - kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - return 0; - } - - if (conn->rac_close_sent) - return 0; - - kranal_init_msg(&conn->rac_msg, RANAL_MSG_CLOSE); - rc = kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - conn->rac_close_sent = (rc == 0); - return 0; - } - - spin_lock_irqsave(&conn->rac_lock, flags); - - if (list_empty(&conn->rac_fmaq)) { - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - if (time_after_eq(jiffies, - conn->rac_last_tx + conn->rac_keepalive)) { - kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); - kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); - } - return 0; - } - - tx = list_entry(conn->rac_fmaq.next, kra_tx_t, tx_list); - list_del(&tx->tx_list); - more_to_do = !list_empty(&conn->rac_fmaq); - - spin_unlock_irqrestore(&conn->rac_lock, flags); - - expect_reply = 0; - switch (tx->tx_msg.ram_type) { - default: - LBUG(); - - case RANAL_MSG_IMMEDIATE: - case RANAL_MSG_PUT_NAK: - case RANAL_MSG_PUT_DONE: - 
case RANAL_MSG_GET_NAK: - case RANAL_MSG_GET_DONE: - rc = kranal_sendmsg(conn, &tx->tx_msg, - tx->tx_buffer, tx->tx_nob); - expect_reply = 0; - break; - - case RANAL_MSG_PUT_REQ: - tx->tx_msg.ram_u.putreq.raprm_cookie = tx->tx_cookie; - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - kranal_map_buffer(tx); - expect_reply = 1; - break; - - case RANAL_MSG_PUT_ACK: - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 1; - break; - - case RANAL_MSG_GET_REQ: - kranal_map_buffer(tx); - tx->tx_msg.ram_u.get.ragm_cookie = tx->tx_cookie; - tx->tx_msg.ram_u.get.ragm_desc.rard_key = tx->tx_map_key; - tx->tx_msg.ram_u.get.ragm_desc.rard_addr.AddressBits = - (__u64)((unsigned long)tx->tx_buffer); - tx->tx_msg.ram_u.get.ragm_desc.rard_nob = tx->tx_nob; - rc = kranal_sendmsg(conn, &tx->tx_msg, NULL, 0); - expect_reply = 1; - break; - } - - if (rc == -EAGAIN) { - /* replace at the head of the list for later */ - spin_lock_irqsave(&conn->rac_lock, flags); - list_add(&tx->tx_list, &conn->rac_fmaq); - spin_unlock_irqrestore(&conn->rac_lock, flags); - - return 0; - } - - LASSERT (rc == 0); - - if (!expect_reply) { - kranal_tx_done(tx, 0); - } else { - spin_lock_irqsave(&conn->rac_lock, flags); - list_add_tail(&tx->tx_list, &conn->rac_replyq); - tx->tx_qtime = jiffies; - spin_unlock_irqrestore(&conn->rac_lock, flags); - } - - return more_to_do; -} - -static inline void -kranal_swab_rdma_desc (kra_rdma_desc_t *d) -{ - __swab64s(&d->rard_key.Key); - __swab16s(&d->rard_key.Cookie); - __swab16s(&d->rard_key.MdHandle); - __swab32s(&d->rard_key.Flags); - __swab64s(&d->rard_addr.AddressBits); - __swab32s(&d->rard_nob); -} - -kra_tx_t * -kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie) -{ - unsigned long flags; - struct list_head *ttmp; - kra_tx_t *tx; - - list_for_each(ttmp, &conn->rac_replyq) { - tx = list_entry(ttmp, kra_tx_t, tx_list); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_msg.ram_type != type) { - CWARN("Unexpected type %x (%x expected) 
" - "matched reply from "LPX64"\n", - tx->tx_msg.ram_type, type, - conn->rac_peer->rap_nid); - return NULL; - } - } - - CWARN("Unmatched reply from "LPX64"\n", conn->rac_peer->rap_nid); - return NULL; -} - -int -kranal_process_receives(kra_conn_t *conn) -{ - unsigned long flags; - __u32 seq; - __u32 nob; - kra_tx_t *tx; - kra_msg_t *msg; - void *prefix; - RAP_RETURN rrc = RapkFmaGetPrefix(conn->rac_rihandle, &prefix); - kra_peer_t *peer = conn->rac_peer; - - if (rrc == RAP_NOT_DONE) - return 0; - - LASSERT (rrc == RAP_SUCCESS); - conn->rac_last_rx = jiffies; - seq = conn->rac_rx_seq++; - msg = (kra_msg_t *)prefix; - - if (msg->ram_magic != RANAL_MSG_MAGIC) { - if (__swab32(msg->ram_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x from "LPX64"\n", - msg->ram_magic, peer->rap_nid); - goto out; - } - - __swab32s(&msg->ram_magic); - __swab16s(&msg->ram_version); - __swab16s(&msg->ram_type); - __swab64s(&msg->ram_srcnid); - __swab64s(&msg->ram_incarnation); - __swab32s(&msg->ram_seq); - - /* NB message type checked below; NOT here... 
*/ - switch (msg->ram_type) { - case RANAL_MSG_PUT_ACK: - kranal_swab_rdma_desc(&msg->ram_u.putack.rapam_desc); - break; - - case RANAL_MSG_GET_REQ: - kranal_swab_rdma_desc(&msg->ram_u.get.ragm_desc); - break; - - default: - break; - } - } - - if (msg->ram_version != RANAL_MSG_VERSION) { - CERROR("Unexpected protocol version %d from "LPX64"\n", - msg->ram_version, peer->rap_nid); - goto out; - } - - if (msg->ram_srcnid != peer->rap_nid) { - CERROR("Unexpected peer "LPX64" from "LPX64"\n", - msg->ram_srcnid, peer->rap_nid); - goto out; - } - - if (msg->ram_incarnation != conn->rac_peer_incarnation) { - CERROR("Unexpected incarnation "LPX64"("LPX64 - " expected) from "LPX64"\n", - msg->ram_incarnation, conn->rac_peer_incarnation, - peer->rap_nid); - goto out; - } - - if (msg->ram_seq != seq) { - CERROR("Unexpected sequence number %d(%d expected) from " - LPX64"\n", msg->ram_seq, seq, peer->rap_nid); - goto out; - } - - if ((msg->ram_type & RANAL_MSG_FENCE) != 0) { - /* This message signals RDMA completion: wait now... 
*/ - rrc = RapkFmaSyncWait(conn->rac_rihandle); - LASSERT (rrc == RAP_SUCCESS); - } - - if (msg->ram_type == RANAL_MSG_CLOSE) { - conn->rac_close_recvd = 1; - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - - if (!conn->rac_closing) - kranal_close_conn_locked(conn, -ETIMEDOUT); - else if (conn->rac_close_sent) - kranal_terminate_conn_locked(conn); - - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - goto out; - } - - if (conn->rac_closing) - goto out; - - conn->rac_rxmsg = msg; /* stash message for portals callbacks */ - /* they'll NULL rac_rxmsg if they consume it */ - switch (msg->ram_type) { - case RANAL_MSG_NOOP: - /* Nothing to do; just a keepalive */ - break; - - case RANAL_MSG_IMMEDIATE: - lib_parse(&kranal_lib, &msg->ram_u.immediate.raim_hdr, conn); - break; - - case RANAL_MSG_PUT_REQ: - lib_parse(&kranal_lib, &msg->ram_u.putreq.raprm_hdr, conn); - - if (conn->rac_rxmsg == NULL) /* lib_parse matched something */ - break; - - tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_NAK); - if (tx == NULL) - break; - - tx->tx_msg.ram_u.completion.racm_cookie = - msg->ram_u.putreq.raprm_cookie; - kranal_post_fma(conn, tx); - break; - - case RANAL_MSG_PUT_NAK: - tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, -ENOENT); /* no match */ - break; - - case RANAL_MSG_PUT_ACK: - tx = kranal_match_reply(conn, RANAL_MSG_PUT_REQ, - msg->ram_u.putack.rapam_src_cookie); - if (tx == NULL) - break; - - kranal_rdma(tx, RANAL_MSG_PUT_DONE, - &msg->ram_u.putack.rapam_desc, - msg->ram_u.putack.rapam_desc.rard_nob, - msg->ram_u.putack.rapam_dst_cookie); - break; - - case RANAL_MSG_PUT_DONE: - tx = kranal_match_reply(conn, RANAL_MSG_PUT_ACK, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == 
RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, 0); - break; - - case RANAL_MSG_GET_REQ: - lib_parse(&kranal_lib, &msg->ram_u.get.ragm_hdr, conn); - - if (conn->rac_rxmsg == NULL) /* lib_parse matched something */ - break; - - tx = kranal_new_tx_msg(0, RANAL_MSG_GET_NAK); - if (tx == NULL) - break; - - tx->tx_msg.ram_u.completion.racm_cookie = msg->ram_u.get.ragm_cookie; - kranal_post_fma(conn, tx); - break; - - case RANAL_MSG_GET_NAK: - tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, -ENOENT); /* no match */ - break; - - case RANAL_MSG_GET_DONE: - tx = kranal_match_reply(conn, RANAL_MSG_GET_REQ, - msg->ram_u.completion.racm_cookie); - if (tx == NULL) - break; - - LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || - tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); - kranal_tx_done(tx, 0); - break; - } - - out: - if (conn->rac_rxmsg != NULL) - kranal_consume_rxmsg(conn, NULL, 0); - - return 1; -} - -int -kranal_scheduler (void *arg) -{ - kra_device_t *dev = (kra_device_t *)arg; - wait_queue_t wait; - char name[16]; - kra_conn_t *conn; - unsigned long flags; - RAP_RETURN rrc; - int rc; - int resched; - int i; - __u32 cqid; - __u32 event_type; - int did_something; - int busy_loops = 0; - - snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx); - kportal_daemonize(name); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - spin_lock_irqsave(&dev->rad_lock, flags); - - while (!kranal_data.kra_shutdown) { - /* Safe: kra_shutdown only set when quiescent */ - - if (busy_loops++ >= RANAL_RESCHED) { - spin_unlock_irqrestore(&dev->rad_lock, flags); - - our_cond_resched(); - busy_loops = 0; - - spin_lock_irqsave(&dev->rad_lock, flags); - } - - did_something = 0; - - if (dev->rad_ready) { - dev->rad_ready = 0; - spin_unlock_irqrestore(&dev->rad_lock, flags); - - rrc = 
RapkCQDone(dev->rad_rdma_cq, &cqid, &event_type); - - LASSERT (rrc == RAP_SUCCESS || rrc == RAP_NOT_DONE); - LASSERT ((event_type & RAPK_CQ_EVENT_OVERRUN) == 0); - - if (rrc == RAP_SUCCESS) { - kranal_process_rdmaq(cqid); - did_something = 1; - } - - rrc = RapkCQDone(dev->rad_fma_cq, &cqid, &event_type); - LASSERT (rrc == RAP_SUCCESS || rrc == RAP_NOT_DONE); - - if (rrc == RAP_SUCCESS) { - if ((event_type & RAPK_CQ_EVENT_OVERRUN) != 0) - kranal_schedule_dev(dev); - else - kranal_schedule_cqid(cqid); - did_something = 1; - } - - spin_lock_irqsave(&dev->rad_lock, flags); - - /* If there were no completions to handle, I leave - * rad_ready clear. NB I cleared it BEFORE I checked - * the completion queues since I'm racing with the - * device callback. */ - - if (did_something) - dev->rad_ready = 1; - } - - if (!list_empty(&dev->rad_connq)) { - conn = list_entry(dev->rad_connq.next, - kra_conn_t, rac_schedlist); - list_del(&conn->rac_schedlist); - spin_unlock_irqrestore(&dev->rad_lock, flags); - - LASSERT (conn->rac_scheduled); - - resched = kranal_process_fmaq(conn); - resched |= kranal_process_receives(conn); - did_something = 1; - - spin_lock_irqsave(&dev->rad_lock, flags); - if (resched) - list_add_tail(&conn->rac_schedlist, - &dev->rad_connq); - } - - if (did_something) - continue; - - add_wait_queue(&dev->rad_waitq, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - spin_unlock_irqrestore(&dev->rad_lock, flags); - - busy_loops = 0; - schedule(); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&dev->rad_waitq, &wait); - - spin_lock_irqsave(&dev->rad_lock, flags); - } - - spin_unlock_irqrestore(&dev->rad_lock, flags); - - kranal_thread_fini(); - return 0; -} - - -lib_nal_t kranal_lib = { - libnal_data: &kranal_data, /* NAL private data */ - libnal_send: kranal_send, - libnal_send_pages: kranal_send_pages, - libnal_recv: kranal_recv, - libnal_recv_pages: kranal_recv_pages, - libnal_dist: kranal_dist -}; diff --git a/lustre/portals/knals/socknal/.cvsignore 
b/lustre/portals/knals/socknal/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/knals/socknal/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/knals/socknal/Makefile.in b/lustre/portals/knals/socknal/Makefile.in deleted file mode 100644 index 633b455..0000000 --- a/lustre/portals/knals/socknal/Makefile.in +++ /dev/null @@ -1,8 +0,0 @@ -MODULES := ksocknal -ksocknal-objs := socknal.o socknal_cb.o - -# If you don't build with -O2, your modules won't insert, becahse htonl is -# just special that way. -EXTRA_POST_CFLAGS := -O2 - -@INCLUDE_RULES@ diff --git a/lustre/portals/knals/socknal/Makefile.mk b/lustre/portals/knals/socknal/Makefile.mk deleted file mode 100644 index 5c1b366..0000000 --- a/lustre/portals/knals/socknal/Makefile.mk +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -include $(src)/../../Kernelenv - -obj-y += ksocknal.o -ksocknal-objs := socknal.o socknal_cb.o - diff --git a/lustre/portals/knals/socknal/autoMakefile.am b/lustre/portals/knals/socknal/autoMakefile.am deleted file mode 100644 index 070b649..0000000 --- a/lustre/portals/knals/socknal/autoMakefile.am +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -modulenet_DATA = ksocknal$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(ksocknal-objs:%.o=%.c) socknal.h diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c deleted file mode 100644 index 7642770..0000000 --- a/lustre/portals/knals/socknal/socknal.c +++ /dev/null @@ -1,2531 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include "socknal.h" - -nal_t ksocknal_api; -ksock_nal_data_t ksocknal_data; -ptl_handle_ni_t ksocknal_ni; -ksock_tunables_t ksocknal_tunables; - -kpr_nal_interface_t ksocknal_router_interface = { - kprni_nalid: SOCKNAL, - kprni_arg: &ksocknal_data, - kprni_fwd: ksocknal_fwd_packet, - kprni_notify: ksocknal_notify, -}; - -#ifdef CONFIG_SYSCTL -#define SOCKNAL_SYSCTL 200 - -#define SOCKNAL_SYSCTL_TIMEOUT 1 -#define SOCKNAL_SYSCTL_EAGER_ACK 2 -#define SOCKNAL_SYSCTL_ZERO_COPY 3 -#define SOCKNAL_SYSCTL_TYPED 4 -#define SOCKNAL_SYSCTL_MIN_BULK 5 -#define SOCKNAL_SYSCTL_BUFFER_SIZE 6 -#define SOCKNAL_SYSCTL_NAGLE 7 -#define SOCKNAL_SYSCTL_IRQ_AFFINITY 8 -#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE 9 -#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10 -#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11 - -static ctl_table ksocknal_ctl_table[] = { - {SOCKNAL_SYSCTL_TIMEOUT, "timeout", - &ksocknal_tunables.ksnd_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", - &ksocknal_tunables.ksnd_eager_ack, sizeof (int), - 0644, NULL, &proc_dointvec}, -#if SOCKNAL_ZC - {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", - &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_TYPED, "typed", - &ksocknal_tunables.ksnd_typed_conns, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", - &ksocknal_tunables.ksnd_min_bulk, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size", - &ksocknal_tunables.ksnd_buffer_size, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_NAGLE, "nagle", - &ksocknal_tunables.ksnd_nagle, sizeof(int), - 0644, NULL, &proc_dointvec}, -#if CPU_AFFINITY - {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity", - &ksocknal_tunables.ksnd_irq_affinity, sizeof(int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle", - &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int), - 0644, NULL, 
&proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count", - &ksocknal_tunables.ksnd_keepalive_count, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl", - &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; - -static ctl_table ksocknal_top_ctl_table[] = { - {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, - { 0 } -}; -#endif - -int -ksocknal_set_mynid(ptl_nid_t nid) -{ - lib_ni_t *ni = &ksocknal_lib.libnal_ni; - - /* FIXME: we have to do this because we call lib_init() at module - * insertion time, which is before we have 'mynid' available. lib_init - * sets the NAL's nid, which it uses to tell other nodes where packets - * are coming from. This is not a very graceful solution to this - * problem. */ - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - ni->ni_pid.nid = nid; - return (0); -} - -void -ksocknal_bind_irq (unsigned int irq) -{ -#if (defined(CONFIG_SMP) && CPU_AFFINITY) - int bind; - int cpu; - unsigned long flags; - char cmdline[64]; - ksock_irqinfo_t *info; - char *argv[] = {"/bin/sh", - "-c", - cmdline, - NULL}; - char *envp[] = {"HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; - - LASSERT (irq < NR_IRQS); - if (irq == 0) /* software NIC or affinity disabled */ - return; - - info = &ksocknal_data.ksnd_irqinfo[irq]; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - LASSERT (info->ksni_valid); - bind = !info->ksni_bound; - info->ksni_bound = 1; - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - if (!bind) /* bound already */ - return; - - cpu = ksocknal_irqsched2cpu(info->ksni_sched); - snprintf (cmdline, sizeof (cmdline), - "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); - - printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); - - /* FIXME: Find a better method of setting IRQ affinity... 
- */ - - USERMODEHELPER(argv[0], argv, envp); -#endif -} - -ksock_interface_t * -ksocknal_ip2iface(__u32 ip) -{ - int i; - ksock_interface_t *iface; - - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - LASSERT(i < SOCKNAL_MAX_INTERFACES); - iface = &ksocknal_data.ksnd_interfaces[i]; - - if (iface->ksni_ipaddr == ip) - return (iface); - } - - return (NULL); -} - -ksock_route_t * -ksocknal_create_route (__u32 ipaddr, int port) -{ - ksock_route_t *route; - - PORTAL_ALLOC (route, sizeof (*route)); - if (route == NULL) - return (NULL); - - atomic_set (&route->ksnr_refcount, 1); - route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies; - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; - route->ksnr_ipaddr = ipaddr; - route->ksnr_port = port; - route->ksnr_connecting = 0; - route->ksnr_connected = 0; - route->ksnr_deleted = 0; - route->ksnr_conn_count = 0; - route->ksnr_share_count = 0; - - return (route); -} - -void -ksocknal_destroy_route (ksock_route_t *route) -{ - if (route->ksnr_peer != NULL) - ksocknal_put_peer (route->ksnr_peer); - - PORTAL_FREE (route, sizeof (*route)); -} - -void -ksocknal_put_route (ksock_route_t *route) -{ - CDEBUG (D_OTHER, "putting route[%p] (%d)\n", - route, atomic_read (&route->ksnr_refcount)); - - LASSERT (atomic_read (&route->ksnr_refcount) > 0); - if (!atomic_dec_and_test (&route->ksnr_refcount)) - return; - - ksocknal_destroy_route (route); -} - -ksock_peer_t * -ksocknal_create_peer (ptl_nid_t nid) -{ - ksock_peer_t *peer; - - LASSERT (nid != PTL_NID_ANY); - - PORTAL_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) - return (NULL); - - memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ - - peer->ksnp_nid = nid; - atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ - peer->ksnp_closing = 0; - INIT_LIST_HEAD (&peer->ksnp_conns); - INIT_LIST_HEAD (&peer->ksnp_routes); - INIT_LIST_HEAD (&peer->ksnp_tx_queue); - - atomic_inc (&ksocknal_data.ksnd_npeers); - return (peer); -} - -void 
-ksocknal_destroy_peer (ksock_peer_t *peer) -{ - CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer); - - LASSERT (atomic_read (&peer->ksnp_refcount) == 0); - LASSERT (list_empty (&peer->ksnp_conns)); - LASSERT (list_empty (&peer->ksnp_routes)); - LASSERT (list_empty (&peer->ksnp_tx_queue)); - - PORTAL_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections and autoconnect routes keep a reference - * on their peer until they are destroyed, so we can be assured - * that _all_ state to do with this peer has been cleaned up when - * its refcount drops to zero. */ - atomic_dec (&ksocknal_data.ksnd_npeers); -} - -void -ksocknal_put_peer (ksock_peer_t *peer) -{ - CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", - peer, peer->ksnp_nid, - atomic_read (&peer->ksnp_refcount)); - - LASSERT (atomic_read (&peer->ksnp_refcount) > 0); - if (!atomic_dec_and_test (&peer->ksnp_refcount)) - return; - - ksocknal_destroy_peer (peer); -} - -ksock_peer_t * -ksocknal_find_peer_locked (ptl_nid_t nid) -{ - struct list_head *peer_list = ksocknal_nid2peerlist (nid); - struct list_head *tmp; - ksock_peer_t *peer; - - list_for_each (tmp, peer_list) { - - peer = list_entry (tmp, ksock_peer_t, ksnp_list); - - LASSERT (!peer->ksnp_closing); - - if (peer->ksnp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ksnp_refcount)); - return (peer); - } - return (NULL); -} - -ksock_peer_t * -ksocknal_get_peer (ptl_nid_t nid) -{ - ksock_peer_t *peer; - - read_lock (&ksocknal_data.ksnd_global_lock); - peer = ksocknal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? 
*/ - atomic_inc (&peer->ksnp_refcount); - read_unlock (&ksocknal_data.ksnd_global_lock); - - return (peer); -} - -void -ksocknal_unlink_peer_locked (ksock_peer_t *peer) -{ - int i; - __u32 ip; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); - ip = peer->ksnp_passive_ips[i]; - - ksocknal_ip2iface(ip)->ksni_npeers--; - } - - LASSERT (list_empty(&peer->ksnp_conns)); - LASSERT (list_empty(&peer->ksnp_routes)); - LASSERT (!peer->ksnp_closing); - peer->ksnp_closing = 1; - list_del (&peer->ksnp_list); - /* lose peerlist's ref */ - ksocknal_put_peer (peer); -} - -int -ksocknal_get_peer_info (int index, ptl_nid_t *nid, - __u32 *myip, __u32 *peer_ip, int *port, - int *conn_count, int *share_count) -{ - ksock_peer_t *peer; - struct list_head *ptmp; - ksock_route_t *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; - - read_lock (&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - - list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - - if (peer->ksnp_n_passive_ips == 0 && - list_empty(&peer->ksnp_routes)) { - if (index-- > 0) - continue; - - *nid = peer->ksnp_nid; - *myip = 0; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - for (j = 0; j < peer->ksnp_n_passive_ips; j++) { - if (index-- > 0) - continue; - - *nid = peer->ksnp_nid; - *myip = peer->ksnp_passive_ips[j]; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - list_for_each (rtmp, &peer->ksnp_routes) { - if (index-- > 0) - continue; - - route = list_entry(rtmp, ksock_route_t, - ksnr_list); - - *nid = peer->ksnp_nid; - *myip = route->ksnr_myipaddr; - *peer_ip = route->ksnr_ipaddr; - *port = route->ksnr_port; - *conn_count = route->ksnr_conn_count; - *share_count = route->ksnr_share_count; - rc = 0; - goto out; - } - } - } - out: - read_unlock 
(&ksocknal_data.ksnd_global_lock); - return (rc); -} - -void -ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) -{ - ksock_peer_t *peer = route->ksnr_peer; - int type = conn->ksnc_type; - ksock_interface_t *iface; - - conn->ksnc_route = route; - atomic_inc (&route->ksnr_refcount); - - if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { - if (route->ksnr_myipaddr == 0) { - /* route wasn't bound locally yet (the initial route) */ - CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(conn->ksnc_myipaddr)); - } else { - CWARN("Rebinding "LPX64" %u.%u.%u.%u from " - "%u.%u.%u.%u to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(conn->ksnc_myipaddr)); - - iface = ksocknal_ip2iface(route->ksnr_myipaddr); - if (iface != NULL) - iface->ksni_nroutes--; - } - route->ksnr_myipaddr = conn->ksnc_myipaddr; - iface = ksocknal_ip2iface(route->ksnr_myipaddr); - if (iface != NULL) - iface->ksni_nroutes++; - } - - route->ksnr_connected |= (1<<type); - route->ksnr_connecting &= ~(1<<type); - route->ksnr_conn_count++; - - /* Successful connection => further attempts can - * proceed immediately */ - route->ksnr_timeout = jiffies; - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; -} - -void -ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) -{ - struct list_head *tmp; - ksock_conn_t *conn; - int type; - ksock_route_t *route2; - - LASSERT (route->ksnr_peer == NULL); - LASSERT (route->ksnr_connecting == 0); - LASSERT (route->ksnr_connected == 0); - - /* LASSERT(unique) */ - list_for_each(tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); - - if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { - CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n", - peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr)); - LBUG(); - } - } - - route->ksnr_peer = peer; - atomic_inc (&peer->ksnp_refcount); - /* peer's routelist takes over my ref on 
'route' */ - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - type = conn->ksnc_type; - - if (conn->ksnc_ipaddr != route->ksnr_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - /* keep going (typed routes) */ - } -} - -void -ksocknal_del_route_locked (ksock_route_t *route) -{ - ksock_peer_t *peer = route->ksnr_peer; - ksock_interface_t *iface; - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - - LASSERT (!route->ksnr_deleted); - - /* Close associated conns */ - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_route != route) - continue; - - ksocknal_close_conn_locked (conn, 0); - } - - if (route->ksnr_myipaddr != 0) { - iface = ksocknal_ip2iface(route->ksnr_myipaddr); - if (iface != NULL) - iface->ksni_nroutes--; - } - - route->ksnr_deleted = 1; - list_del (&route->ksnr_list); - ksocknal_put_route (route); /* drop peer's ref */ - - if (list_empty (&peer->ksnp_routes) && - list_empty (&peer->ksnp_conns)) { - /* I've just removed the last autoconnect route of a peer - * with no active connections */ - ksocknal_unlink_peer_locked (peer); - } -} - -int -ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port) -{ - unsigned long flags; - struct list_head *tmp; - ksock_peer_t *peer; - ksock_peer_t *peer2; - ksock_route_t *route; - ksock_route_t *route2; - - if (nid == PTL_NID_ANY) - return (-EINVAL); - - /* Have a brand new peer ready... 
*/ - peer = ksocknal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); - - route = ksocknal_create_route (ipaddr, port); - if (route == NULL) { - ksocknal_put_peer (peer); - return (-ENOMEM); - } - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - peer2 = ksocknal_find_peer_locked (nid); - if (peer2 != NULL) { - ksocknal_put_peer (peer); - peer = peer2; - } else { - /* peer table takes my ref on peer */ - list_add_tail (&peer->ksnp_list, - ksocknal_nid2peerlist (nid)); - } - - route2 = NULL; - list_for_each (tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, ksock_route_t, ksnr_list); - - if (route2->ksnr_ipaddr == ipaddr) - break; - - route2 = NULL; - } - if (route2 == NULL) { - ksocknal_add_route_locked(peer, route); - route->ksnr_share_count++; - } else { - ksocknal_put_route(route); - route2->ksnr_share_count++; - } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - return (0); -} - -void -ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) -{ - ksock_conn_t *conn; - ksock_route_t *route; - struct list_head *tmp; - struct list_head *nxt; - int nshared; - - LASSERT (!peer->ksnp_closing); - - list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); - - if (single_share && route->ksnr_share_count == 0) - continue; - - /* no match */ - if (!(ip == 0 || route->ksnr_ipaddr == ip)) - continue; - - if (!single_share) - route->ksnr_share_count = 0; - else if (route->ksnr_share_count > 0) - route->ksnr_share_count--; - - if (route->ksnr_share_count == 0) { - /* This deletes associated conns too */ - ksocknal_del_route_locked (route); - } - - if (single_share) - break; - } - - nshared = 0; - list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); - nshared += route->ksnr_share_count; - } - - if (nshared == 0) { - /* remove everything else if there are no explicit entries - * left */ - - 
list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); - - /* we should only be removing auto-entries */ - LASSERT(route->ksnr_share_count == 0); - ksocknal_del_route_locked (route); - } - - list_for_each_safe (tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - - ksocknal_close_conn_locked(conn, 0); - } - } - - /* NB peer unlinks itself when last conn/route is removed */ -} - -int -ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) -{ - unsigned long flags; - struct list_head *ptmp; - struct list_head *pnxt; - ksock_peer_t *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; - else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - - if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) - continue; - - ksocknal_del_peer_locked (peer, ip, single_share); - rc = 0; /* matched! 
*/ - - if (single_share) - break; - } - } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - return (rc); -} - -ksock_conn_t * -ksocknal_get_conn_by_idx (int index) -{ - ksock_peer_t *peer; - struct list_head *ptmp; - ksock_conn_t *conn; - struct list_head *ctmp; - int i; - - read_lock (&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - - LASSERT (!peer->ksnp_closing); - - list_for_each (ctmp, &peer->ksnp_conns) { - if (index-- > 0) - continue; - - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); - read_unlock (&ksocknal_data.ksnd_global_lock); - return (conn); - } - } - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - return (NULL); -} - -int -ksocknal_get_conn_addrs (ksock_conn_t *conn) -{ - struct sockaddr_in sin; - int len = sizeof (sin); - int rc; - - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 2); - /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ - LASSERT (!conn->ksnc_closing); - - if (rc != 0) { - CERROR ("Error %d getting sock peer IP\n", rc); - return rc; - } - - conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); - conn->ksnc_port = ntohs (sin.sin_port); - - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 0); - if (rc != 0) { - CERROR ("Error %d getting sock local IP\n", rc); - return rc; - } - - conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr); - - return 0; -} - -unsigned int -ksocknal_sock_irq (struct socket *sock) -{ - int irq = 0; - struct dst_entry *dst; - - if (!ksocknal_tunables.ksnd_irq_affinity) - return 0; - - dst = sk_dst_get (sock->sk); - if (dst != NULL) { - if (dst->dev != NULL) { - irq = dst->dev->irq; - if (irq >= NR_IRQS) { - CERROR ("Unexpected IRQ %x\n", irq); - irq = 0; - } - } - dst_release (dst); - } - - return (irq); -} - -ksock_sched_t * -ksocknal_choose_scheduler_locked (unsigned int irq) -{ - ksock_sched_t *sched; - ksock_irqinfo_t *info; - int i; - - LASSERT (irq < NR_IRQS); - info = &ksocknal_data.ksnd_irqinfo[irq]; - - if (irq != 0 && /* hardware NIC */ - info->ksni_valid) { /* already set up */ - return (&ksocknal_data.ksnd_schedulers[info->ksni_sched]); - } - - /* software NIC (irq == 0) || not associated with a scheduler yet. - * Choose the CPU with the fewest connections... */ - sched = &ksocknal_data.ksnd_schedulers[0]; - for (i = 1; i < ksocknal_data.ksnd_nschedulers; i++) - if (sched->kss_nconns > - ksocknal_data.ksnd_schedulers[i].kss_nconns) - sched = &ksocknal_data.ksnd_schedulers[i]; - - if (irq != 0) { /* Hardware NIC */ - info->ksni_valid = 1; - info->ksni_sched = sched - ksocknal_data.ksnd_schedulers; - - /* no overflow... 
*/ - LASSERT (info->ksni_sched == sched - ksocknal_data.ksnd_schedulers); - } - - return (sched); -} - -int -ksocknal_local_ipvec (__u32 *ipaddrs) -{ - int i; - int nip; - - read_lock (&ksocknal_data.ksnd_global_lock); - - nip = ksocknal_data.ksnd_ninterfaces; - for (i = 0; i < nip; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); - - ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; - LASSERT (ipaddrs[i] != 0); - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - return (nip); -} - -int -ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) -{ - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; - - for (i = 0; i < nips; i++) { - if (ips[i] == 0) - continue; - - this_xor = (ips[i] ^ iface->ksni_ipaddr); - this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0; - - if (!(best < 0 || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_xor > this_xor))) - continue; - - best = i; - best_netmatch = this_netmatch; - best_xor = this_xor; - } - - LASSERT (best >= 0); - return (best); -} - -int -ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) -{ - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; - ksock_interface_t *iface; - ksock_interface_t *best_iface; - int n_ips; - int i; - int j; - int k; - __u32 ip; - __u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; - - /* CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. 
We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness shouldn't matter */ - - /* Also note that I'm not going to return more than n_peerips - * interfaces, even if I have more myself */ - - write_lock_irqsave(global_lock, flags); - - LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES); - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); - - n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces); - - for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { - /* ^ yes really... */ - - /* If we have any new interfaces, first tick off all the - * peer IPs that match old interfaces, then choose new - * interfaces to match the remaining peer IPS. - * We don't forget interfaces we've stopped using; we might - * start using them again... */ - - if (i < peer->ksnp_n_passive_ips) { - /* Old interface. */ - ip = peer->ksnp_passive_ips[i]; - best_iface = ksocknal_ip2iface(ip); - - /* peer passive ips are kept up to date */ - LASSERT(best_iface != NULL); - } else { - /* choose a new interface */ - LASSERT (i == peer->ksnp_n_passive_ips); - - best_iface = NULL; - best_netmatch = 0; - best_npeers = 0; - - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; - ip = iface->ksni_ipaddr; - - for (k = 0; k < peer->ksnp_n_passive_ips; k++) - if (peer->ksnp_passive_ips[k] == ip) - break; - - if (k < peer->ksnp_n_passive_ips) /* using it already */ - continue; - - k = ksocknal_match_peerip(iface, peerips, n_peerips); - xor = (ip ^ peerips[k]); - this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 
1 : 0; - - if (!(best_iface == NULL || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_npeers > iface->ksni_npeers))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_npeers = iface->ksni_npeers; - } - - best_iface->ksni_npeers++; - ip = best_iface->ksni_ipaddr; - peer->ksnp_passive_ips[i] = ip; - peer->ksnp_n_passive_ips = i+1; - } - - LASSERT (best_iface != NULL); - - /* mark the best matching peer IP used */ - j = ksocknal_match_peerip(best_iface, peerips, n_peerips); - peerips[j] = 0; - } - - /* Overwrite input peer IP addresses */ - memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); - - write_unlock_irqrestore(global_lock, flags); - - return (n_ips); -} - -void -ksocknal_create_routes(ksock_peer_t *peer, int port, - __u32 *peer_ipaddrs, int npeer_ipaddrs) -{ - ksock_route_t *newroute = NULL; - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; - struct list_head *rtmp; - ksock_route_t *route; - ksock_interface_t *iface; - ksock_interface_t *best_iface; - int best_netmatch; - int this_netmatch; - int best_nroutes; - int i; - int j; - - /* CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness here shouldn't matter */ - - write_lock_irqsave(global_lock, flags); - - LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES); - - for (i = 0; i < npeer_ipaddrs; i++) { - if (newroute != NULL) { - newroute->ksnr_ipaddr = peer_ipaddrs[i]; - } else { - write_unlock_irqrestore(global_lock, flags); - - newroute = ksocknal_create_route(peer_ipaddrs[i], port); - if (newroute == NULL) - return; - - write_lock_irqsave(global_lock, flags); - } - - /* Already got a route? 
*/ - route = NULL; - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); - - if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) - break; - - route = NULL; - } - if (route != NULL) - continue; - - best_iface = NULL; - best_nroutes = 0; - best_netmatch = 0; - - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); - - /* Select interface to connect from */ - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; - - /* Using this interface already? */ - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); - - if (route->ksnr_myipaddr == iface->ksni_ipaddr) - break; - - route = NULL; - } - if (route != NULL) - continue; - - this_netmatch = (((iface->ksni_ipaddr ^ - newroute->ksnr_ipaddr) & - iface->ksni_netmask) == 0) ? 1 : 0; - - if (!(best_iface == NULL || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_nroutes > iface->ksni_nroutes))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_nroutes = iface->ksni_nroutes; - } - - if (best_iface == NULL) - continue; - - newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; - best_iface->ksni_nroutes++; - - ksocknal_add_route_locked(peer, newroute); - newroute = NULL; - } - - write_unlock_irqrestore(global_lock, flags); - if (newroute != NULL) - ksocknal_put_route(newroute); -} - -int -ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) -{ - int passive = (type == SOCKNAL_CONN_NONE); - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - __u32 ipaddrs[SOCKNAL_MAX_INTERFACES]; - int nipaddrs; - ptl_nid_t nid; - struct list_head *tmp; - __u64 incarnation; - unsigned long flags; - ksock_conn_t *conn; - ksock_conn_t *conn2; - ksock_peer_t *peer = NULL; - ksock_peer_t *peer2; - ksock_sched_t *sched; - unsigned int irq; - ksock_tx_t *tx; - int rc; - - /* NB, sock has an associated file since (a) this 
connection might - * have been created in userland and (b) we need to refcount the - * socket so that we don't close it while I/O is being done on - * it, and sock->file has that pre-cooked... */ - LASSERT (sock->file != NULL); - LASSERT (file_count(sock->file) > 0); - LASSERT (route == NULL || !passive); - - rc = ksocknal_setup_sock (sock); - if (rc != 0) - return (rc); - - irq = ksocknal_sock_irq (sock); - - PORTAL_ALLOC(conn, sizeof(*conn)); - if (conn == NULL) - return (-ENOMEM); - - memset (conn, 0, sizeof (*conn)); - conn->ksnc_peer = NULL; - conn->ksnc_route = NULL; - conn->ksnc_sock = sock; - conn->ksnc_type = type; - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; - atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ - - conn->ksnc_rx_ready = 0; - conn->ksnc_rx_scheduled = 0; - ksocknal_new_packet (conn, 0); - - INIT_LIST_HEAD (&conn->ksnc_tx_queue); - conn->ksnc_tx_ready = 0; - conn->ksnc_tx_scheduled = 0; - atomic_set (&conn->ksnc_tx_nob, 0); - - /* stash conn's local and remote addrs */ - rc = ksocknal_get_conn_addrs (conn); - if (rc != 0) - goto failed_0; - - if (!passive) { - /* Active connection sends HELLO eagerly */ - rc = ksocknal_local_ipvec(ipaddrs); - if (rc < 0) - goto failed_0; - nipaddrs = rc; - - rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs); - if (rc != 0) - goto failed_0; - } - - /* Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to */ - nid = (route == NULL) ? 
PTL_NID_ANY : route->ksnr_peer->ksnp_nid; - rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs); - if (rc < 0) - goto failed_0; - nipaddrs = rc; - LASSERT (nid != PTL_NID_ANY); - - if (route != NULL) { - peer = route->ksnr_peer; - atomic_inc(&peer->ksnp_refcount); - } else { - peer = ksocknal_create_peer(nid); - if (peer == NULL) { - rc = -ENOMEM; - goto failed_0; - } - - write_lock_irqsave(global_lock, flags); - - peer2 = ksocknal_find_peer_locked(nid); - if (peer2 == NULL) { - /* NB this puts an "empty" peer in the peer - * table (which takes my ref) */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(nid)); - } else { - ksocknal_put_peer(peer); - peer = peer2; - } - /* +1 ref for me */ - atomic_inc(&peer->ksnp_refcount); - - write_unlock_irqrestore(global_lock, flags); - } - - if (!passive) { - ksocknal_create_routes(peer, conn->ksnc_port, - ipaddrs, nipaddrs); - rc = 0; - } else { - rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs); - LASSERT (rc >= 0); - rc = ksocknal_send_hello (conn, ipaddrs, rc); - } - if (rc < 0) - goto failed_1; - - write_lock_irqsave (global_lock, flags); - - if (peer->ksnp_closing || - (route != NULL && route->ksnr_deleted)) { - /* route/peer got closed under me */ - rc = -ESTALE; - goto failed_2; - } - - /* Refuse to duplicate an existing connection (both sides might - * autoconnect at once), unless this is a loopback connection */ - if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); - - if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || - conn2->ksnc_myipaddr != conn->ksnc_myipaddr || - conn2->ksnc_type != conn->ksnc_type || - conn2->ksnc_incarnation != incarnation) - continue; - - CWARN("Not creating duplicate connection to " - "%u.%u.%u.%u type %d\n", - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); - rc = -EALREADY; - goto failed_2; - } - } - - /* If the connection created by this route didn't bind to the IP - * address the route 
connected to, the connection/route matching - * code below probably isn't going to work. */ - if (route != NULL && - route->ksnr_ipaddr != conn->ksnc_ipaddr) { - CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(conn->ksnc_ipaddr)); - } - - /* Search for a route corresponding to the new connection and - * create an association. This allows incoming connections created - * by routes in my peer to match my own route entries so I don't - * continually create duplicate routes. */ - list_for_each (tmp, &peer->ksnp_routes) { - route = list_entry(tmp, ksock_route_t, ksnr_list); - - if (route->ksnr_ipaddr != conn->ksnc_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - break; - } - - /* Give conn a ref on sock->file since we're going to return success */ - get_file(sock->file); - - conn->ksnc_peer = peer; /* conn takes my ref on peer */ - conn->ksnc_incarnation = incarnation; - peer->ksnp_last_alive = jiffies; - peer->ksnp_error = 0; - - sched = ksocknal_choose_scheduler_locked (irq); - sched->kss_nconns++; - conn->ksnc_scheduler = sched; - - /* Set the deadline for the outgoing HELLO to drain */ - conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - mb(); /* order with adding to peer's conn list */ - - list_add (&conn->ksnc_list, &peer->ksnp_conns); - atomic_inc (&conn->ksnc_refcount); - - /* NB my callbacks block while I hold ksnd_global_lock */ - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; - - /* Take all the packets blocking for a connection. - * NB, it might be nicer to share these blocked packets among any - * other connections that are becoming established. 
*/ - while (!list_empty (&peer->ksnp_tx_queue)) { - tx = list_entry (peer->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - - list_del (&tx->tx_list); - ksocknal_queue_tx_locked (tx, conn); - } - - rc = ksocknal_close_stale_conns_locked(peer, incarnation); - if (rc != 0) - CERROR ("Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", - rc, conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr)); - - write_unlock_irqrestore (global_lock, flags); - - ksocknal_bind_irq (irq); - - /* Call the callbacks right now to get things going. */ - if (ksocknal_getconnsock(conn) == 0) { - ksocknal_data_ready (sock->sk, 0); - ksocknal_write_space (sock->sk); - ksocknal_putconnsock(conn); - } - - CWARN("New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d" - " incarnation:"LPX64" sched[%d]/%d\n", - nid, HIPQUAD(conn->ksnc_myipaddr), - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, - (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); - - ksocknal_put_conn (conn); - return (0); - - failed_2: - if (!peer->ksnp_closing && - list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) - ksocknal_unlink_peer_locked(peer); - write_unlock_irqrestore(global_lock, flags); - - failed_1: - ksocknal_put_peer (peer); - - failed_0: - PORTAL_FREE (conn, sizeof(*conn)); - - LASSERT (rc != 0); - return (rc); -} - -void -ksocknal_close_conn_locked (ksock_conn_t *conn, int error) -{ - /* This just does the immmediate housekeeping, and queues the - * connection for the reaper to terminate. 
- * Caller holds ksnd_global_lock exclusively in irq context */ - ksock_peer_t *peer = conn->ksnc_peer; - ksock_route_t *route; - ksock_conn_t *conn2; - struct list_head *tmp; - - LASSERT (peer->ksnp_error == 0); - LASSERT (!conn->ksnc_closing); - conn->ksnc_closing = 1; - atomic_inc (&ksocknal_data.ksnd_nclosing_conns); - - /* ksnd_deathrow_conns takes over peer's ref */ - list_del (&conn->ksnc_list); - - route = conn->ksnc_route; - if (route != NULL) { - /* dissociate conn from route... */ - LASSERT (!route->ksnr_deleted); - LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0); - LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); - - conn2 = NULL; - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); - - if (conn2->ksnc_route == route && - conn2->ksnc_type == conn->ksnc_type) - break; - - conn2 = NULL; - } - if (conn2 == NULL) - route->ksnr_connected &= ~(1 << conn->ksnc_type); - - conn->ksnc_route = NULL; - -#if 0 /* irrelevent with only eager routes */ - list_del (&route->ksnr_list); /* make route least favourite */ - list_add_tail (&route->ksnr_list, &peer->ksnp_routes); -#endif - ksocknal_put_route (route); /* drop conn's ref on route */ - } - - if (list_empty (&peer->ksnp_conns)) { - /* No more connections to this peer */ - - peer->ksnp_error = error; /* stash last conn close reason */ - - if (list_empty (&peer->ksnp_routes)) { - /* I've just closed last conn belonging to a - * non-autoconnecting peer */ - ksocknal_unlink_peer_locked (peer); - } - } - - spin_lock (&ksocknal_data.ksnd_reaper_lock); - - list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); - wake_up (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock (&ksocknal_data.ksnd_reaper_lock); -} - -void -ksocknal_terminate_conn (ksock_conn_t *conn) -{ - /* This gets called by the reaper (guaranteed thread context) to - * disengage the socket from its callbacks and close it. 
- * ksnc_refcount will eventually hit zero, and then the reaper will - * destroy it. */ - unsigned long flags; - ksock_peer_t *peer = conn->ksnc_peer; - ksock_sched_t *sched = conn->ksnc_scheduler; - struct timeval now; - time_t then = 0; - int notify = 0; - - LASSERT(conn->ksnc_closing); - - /* wake up the scheduler to "send" all remaining packets to /dev/null */ - spin_lock_irqsave(&sched->kss_lock, flags); - - if (!conn->ksnc_tx_scheduled && - !list_empty(&conn->ksnc_tx_queue)){ - list_add_tail (&conn->ksnc_tx_list, - &sched->kss_tx_conns); - /* a closing conn is always ready to tx */ - conn->ksnc_tx_ready = 1; - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); - - wake_up (&sched->kss_waitq); - } - - spin_unlock_irqrestore (&sched->kss_lock, flags); - - /* serialise with callbacks */ - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - /* Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! */ - conn->ksnc_sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - conn->ksnc_sock->sk->sk_write_space = conn->ksnc_saved_write_space; - - /* A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. */ - conn->ksnc_sock->sk->sk_user_data = NULL; - - /* OK, so this conn may not be completely disengaged from its - * scheduler yet, but it _has_ committed to terminate... 
*/ - conn->ksnc_scheduler->kss_nconns--; - - if (peer->ksnp_error != 0) { - /* peer's last conn closed in error */ - LASSERT (list_empty (&peer->ksnp_conns)); - - /* convert peer's last-known-alive timestamp from jiffies */ - do_gettimeofday (&now); - then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; - notify = 1; - } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - /* The socket is closed on the final put; either here, or in - * ksocknal_{send,recv}msg(). Since we set up the linger2 option - * when the connection was established, this will close the socket - * immediately, aborting anything buffered in it. Any hung - * zero-copy transmits will therefore complete in finite time. */ - ksocknal_putconnsock (conn); - - if (notify) - kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, - 0, then); -} - -void -ksocknal_destroy_conn (ksock_conn_t *conn) -{ - /* Final coup-de-grace of the reaper */ - CDEBUG (D_NET, "connection %p\n", conn); - - LASSERT (atomic_read (&conn->ksnc_refcount) == 0); - LASSERT (conn->ksnc_route == NULL); - LASSERT (!conn->ksnc_tx_scheduled); - LASSERT (!conn->ksnc_rx_scheduled); - LASSERT (list_empty(&conn->ksnc_tx_queue)); - - /* complete current receive if any */ - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_BODY: - CERROR("Completing partial receive from "LPX64 - ", ip %d.%d.%d.%d:%d, with error\n", - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); - break; - case SOCKNAL_RX_BODY_FWD: - ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); - break; - case SOCKNAL_RX_HEADER: - case SOCKNAL_RX_SLOP: - break; - default: - LBUG (); - break; - } - - ksocknal_put_peer (conn->ksnc_peer); - - PORTAL_FREE (conn, sizeof (*conn)); - atomic_dec (&ksocknal_data.ksnd_nclosing_conns); -} - -void -ksocknal_put_conn (ksock_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", - 
conn, conn->ksnc_peer->ksnp_nid, - atomic_read (&conn->ksnc_refcount)); - - LASSERT (atomic_read (&conn->ksnc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ksnc_refcount)) - return; - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - - list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - wake_up (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); -} - -int -ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why) -{ - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - - if (ipaddr == 0 || - conn->ksnc_ipaddr == ipaddr) { - count++; - ksocknal_close_conn_locked (conn, why); - } - } - - return (count); -} - -int -ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation) -{ - ksock_conn_t *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_incarnation == incarnation) - continue; - - CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d " - "incarnation:"LPX64"("LPX64")\n", - peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_incarnation, incarnation); - - count++; - ksocknal_close_conn_locked (conn, -ESTALE); - } - - return (count); -} - -int -ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) -{ - ksock_peer_t *peer = conn->ksnc_peer; - __u32 ipaddr = conn->ksnc_ipaddr; - unsigned long flags; - int count; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - count = ksocknal_close_peer_conns_locked (peer, ipaddr, why); - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - return (count); -} - -int -ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) -{ - unsigned long flags; - ksock_peer_t 
*peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; - else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - - if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) - continue; - - count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0); - } - } - - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - /* wildcards always succeed */ - if (nid == PTL_NID_ANY || ipaddr == 0) - return (0); - - return (count == 0 ? -ENOENT : 0); -} - -void -ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) -{ - /* The router is telling me she's been notified of a change in - * gateway state.... */ - - CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); - - if (!alive) { - /* If the gateway crashed, close all open connections... */ - ksocknal_close_matching_conns (gw_nid, 0); - return; - } - - /* ...otherwise do nothing. We can only establish new connections - * if we have autroutes, and these connect on demand. 
*/ -} - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - return &(sk->tp_pinfo.af_tcp); -} -#else -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - struct tcp_sock *s = (struct tcp_sock *)sk; - return &s->tcp; -} -#endif - -void -ksocknal_push_conn (ksock_conn_t *conn) -{ - struct sock *sk; - struct tcp_opt *tp; - int nonagle; - int val = 1; - int rc; - mm_segment_t oldmm; - - rc = ksocknal_getconnsock (conn); - if (rc != 0) /* being shut down */ - return; - - sk = conn->ksnc_sock->sk; - tp = sock2tcp_opt(sk); - - lock_sock (sk); - nonagle = tp->nonagle; - tp->nonagle = 1; - release_sock (sk); - - oldmm = get_fs (); - set_fs (KERNEL_DS); - - rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof (val)); - LASSERT (rc == 0); - - set_fs (oldmm); - - lock_sock (sk); - tp->nonagle = nonagle; - release_sock (sk); - - ksocknal_putconnsock (conn); -} - -void -ksocknal_push_peer (ksock_peer_t *peer) -{ - int index; - int i; - struct list_head *tmp; - ksock_conn_t *conn; - - for (index = 0; ; index++) { - read_lock (&ksocknal_data.ksnd_global_lock); - - i = 0; - conn = NULL; - - list_for_each (tmp, &peer->ksnp_conns) { - if (i++ == index) { - conn = list_entry (tmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); - break; - } - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - - if (conn == NULL) - break; - - ksocknal_push_conn (conn); - ksocknal_put_conn (conn); - } -} - -int -ksocknal_push (ptl_nid_t nid) -{ - ksock_peer_t *peer; - struct list_head *tmp; - int index; - int i; - int j; - int rc = -ENOENT; - - if (nid != PTL_NID_ANY) { - peer = ksocknal_get_peer (nid); - - if (peer != NULL) { - rc = 0; - ksocknal_push_peer (peer); - ksocknal_put_peer (peer); - } - return (rc); - } - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - for (j = 0; ; j++) { - read_lock (&ksocknal_data.ksnd_global_lock); - - index = 0; - peer = NULL; - - list_for_each (tmp, 
&ksocknal_data.ksnd_peers[i]) { - if (index++ == j) { - peer = list_entry(tmp, ksock_peer_t, - ksnp_list); - atomic_inc (&peer->ksnp_refcount); - break; - } - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - - if (peer != NULL) { - rc = 0; - ksocknal_push_peer (peer); - ksocknal_put_peer (peer); - } - } - - } - - return (rc); -} - -int -ksocknal_add_interface(__u32 ipaddress, __u32 netmask) -{ - unsigned long flags; - ksock_interface_t *iface; - int rc; - int i; - int j; - struct list_head *ptmp; - ksock_peer_t *peer; - struct list_head *rtmp; - ksock_route_t *route; - - if (ipaddress == 0 || - netmask == 0) - return (-EINVAL); - - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); - - iface = ksocknal_ip2iface(ipaddress); - if (iface != NULL) { - /* silently ignore dups */ - rc = 0; - } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) { - rc = -ENOSPC; - } else { - iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++]; - - iface->ksni_ipaddr = ipaddress; - iface->ksni_netmask = netmask; - iface->ksni_nroutes = 0; - iface->ksni_npeers = 0; - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, ksock_peer_t, ksnp_list); - - for (j = 0; i < peer->ksnp_n_passive_ips; j++) - if (peer->ksnp_passive_ips[j] == ipaddress) - iface->ksni_npeers++; - - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, ksock_route_t, ksnr_list); - - if (route->ksnr_myipaddr == ipaddress) - iface->ksni_nroutes++; - } - } - } - - rc = 0; - /* NB only new connections will pay attention to the new interface! 
*/ - } - - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); - - return (rc); -} - -void -ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) -{ - struct list_head *tmp; - struct list_head *nxt; - ksock_route_t *route; - ksock_conn_t *conn; - int i; - int j; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) - if (peer->ksnp_passive_ips[i] == ipaddr) { - for (j = i+1; j < peer->ksnp_n_passive_ips; j++) - peer->ksnp_passive_ips[j-1] = - peer->ksnp_passive_ips[j]; - peer->ksnp_n_passive_ips--; - break; - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry (tmp, ksock_route_t, ksnr_list); - - if (route->ksnr_myipaddr != ipaddr) - continue; - - if (route->ksnr_share_count != 0) { - /* Manually created; keep, but unbind */ - route->ksnr_myipaddr = 0; - } else { - ksocknal_del_route_locked(route); - } - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, ksock_conn_t, ksnc_list); - - if (conn->ksnc_myipaddr == ipaddr) - ksocknal_close_conn_locked (conn, 0); - } -} - -int -ksocknal_del_interface(__u32 ipaddress) -{ - int rc = -ENOENT; - unsigned long flags; - struct list_head *tmp; - struct list_head *nxt; - ksock_peer_t *peer; - __u32 this_ip; - int i; - int j; - - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); - - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; - - if (!(ipaddress == 0 || - ipaddress == this_ip)) - continue; - - rc = 0; - - for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++) - ksocknal_data.ksnd_interfaces[j-1] = - ksocknal_data.ksnd_interfaces[j]; - - ksocknal_data.ksnd_ninterfaces--; - - for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { - list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) { - peer = list_entry(tmp, ksock_peer_t, ksnp_list); - - ksocknal_peer_del_interface_locked(peer, this_ip); - } - } - } - - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, 
flags); - - return (rc); -} - -int -ksocknal_cmd(struct portals_cfg *pcfg, void * private) -{ - int rc; - - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_INTERFACE: { - ksock_interface_t *iface; - - read_lock (&ksocknal_data.ksnd_global_lock); - - if (pcfg->pcfg_count < 0 || - pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) { - rc = -ENOENT; - } else { - rc = 0; - iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count]; - - pcfg->pcfg_id = iface->ksni_ipaddr; - pcfg->pcfg_misc = iface->ksni_netmask; - pcfg->pcfg_fd = iface->ksni_npeers; - pcfg->pcfg_count = iface->ksni_nroutes; - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - break; - } - case NAL_CMD_ADD_INTERFACE: { - rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */ - pcfg->pcfg_misc); /* net mask */ - break; - } - case NAL_CMD_DEL_INTERFACE: { - rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */ - break; - } - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - __u32 myip = 0; - __u32 ip = 0; - int port = 0; - int conn_count = 0; - int share_count = 0; - - rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid, - &myip, &ip, &port, - &conn_count, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = myip; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = conn_count; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = ksocknal_add_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_misc); /* port */ - break; - } - case NAL_CMD_DEL_PEER: { - rc = ksocknal_del_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_flags); /* single_share? 
*/ - break; - } - case NAL_CMD_GET_CONN: { - ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count); - - if (conn == NULL) - rc = -ENOENT; - else { - int txmem; - int rxmem; - int nagle; - - ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle); - - rc = 0; - pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; - pcfg->pcfg_id = conn->ksnc_ipaddr; - pcfg->pcfg_misc = conn->ksnc_port; - pcfg->pcfg_fd = conn->ksnc_myipaddr; - pcfg->pcfg_flags = conn->ksnc_type; - pcfg->pcfg_gw_nal = conn->ksnc_scheduler - - ksocknal_data.ksnd_schedulers; - pcfg->pcfg_count = txmem; - pcfg->pcfg_size = rxmem; - pcfg->pcfg_wait = nagle; - ksocknal_put_conn (conn); - } - break; - } - case NAL_CMD_REGISTER_PEER_FD: { - struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc); - int type = pcfg->pcfg_misc; - - if (sock == NULL) - break; - - switch (type) { - case SOCKNAL_CONN_NONE: - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - case SOCKNAL_CONN_BULK_IN: - case SOCKNAL_CONN_BULK_OUT: - rc = ksocknal_create_conn(NULL, sock, type); - break; - default: - rc = -EINVAL; - break; - } - fput (sock->file); - break; - } - case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_matching_conns (pcfg->pcfg_nid, - pcfg->pcfg_id); - break; - } - case NAL_CMD_REGISTER_MYNID: { - rc = ksocknal_set_mynid (pcfg->pcfg_nid); - break; - } - case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push (pcfg->pcfg_nid); - break; - } - default: - rc = -EINVAL; - break; - } - - return rc; -} - -void -ksocknal_free_fmbs (ksock_fmb_pool_t *p) -{ - int npages = p->fmp_buff_pages; - ksock_fmb_t *fmb; - int i; - - LASSERT (list_empty(&p->fmp_blocked_conns)); - LASSERT (p->fmp_nactive_fmbs == 0); - - while (!list_empty(&p->fmp_idle_fmbs)) { - - fmb = list_entry(p->fmp_idle_fmbs.next, - ksock_fmb_t, fmb_list); - - for (i = 0; i < npages; i++) - if (fmb->fmb_kiov[i].kiov_page != NULL) - __free_page(fmb->fmb_kiov[i].kiov_page); - - list_del(&fmb->fmb_list); - PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); 
- } -} - -void -ksocknal_free_buffers (void) -{ - ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp); - ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp); - - LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0); - - if (ksocknal_data.ksnd_schedulers != NULL) - PORTAL_FREE (ksocknal_data.ksnd_schedulers, - sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - - PORTAL_FREE (ksocknal_data.ksnd_peers, - sizeof (struct list_head) * - ksocknal_data.ksnd_peer_hash_size); -} - -void -ksocknal_api_shutdown (nal_t *nal) -{ - ksock_sched_t *sched; - int i; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &ksocknal_api); - - switch (ksocknal_data.ksnd_init) { - default: - LASSERT (0); - - case SOCKNAL_INIT_ALL: - libcfs_nal_cmd_unregister(SOCKNAL); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; - /* fall through */ - - case SOCKNAL_INIT_LIB: - /* No more calls to ksocknal_cmd() to create new - * autoroutes/connections since we're being unloaded. */ - - /* Delete all peers */ - ksocknal_del_peer(PTL_NID_ANY, 0, 0); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect\n", - atomic_read (&ksocknal_data.ksnd_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - /* Tell lib we've stopped calling into her. 
*/ - lib_fini(&ksocknal_lib); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - /* fall through */ - - case SOCKNAL_INIT_DATA: - LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); - LASSERT (ksocknal_data.ksnd_peers != NULL); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); - } - LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); - LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); - - if (ksocknal_data.ksnd_schedulers != NULL) - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - ksock_sched_t *kss = - &ksocknal_data.ksnd_schedulers[i]; - - LASSERT (list_empty (&kss->kss_tx_conns)); - LASSERT (list_empty (&kss->kss_rx_conns)); - LASSERT (kss->kss_nconns == 0); - } - - /* stop router calling me */ - kpr_shutdown (&ksocknal_data.ksnd_router); - - /* flag threads to terminate; wake and wait for them to die */ - ksocknal_data.ksnd_shuttingdown = 1; - wake_up_all (&ksocknal_data.ksnd_autoconnectd_waitq); - wake_up_all (&ksocknal_data.ksnd_reaper_waitq); - - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - sched = &ksocknal_data.ksnd_schedulers[i]; - wake_up_all(&sched->kss_waitq); - } - - i = 4; - read_lock(&ksocknal_data.ksnd_global_lock); - while (ksocknal_data.ksnd_nthreads != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "waiting for %d threads to terminate\n", - ksocknal_data.ksnd_nthreads); - read_unlock(&ksocknal_data.ksnd_global_lock); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - read_lock(&ksocknal_data.ksnd_global_lock); - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - kpr_deregister (&ksocknal_data.ksnd_router); - - ksocknal_free_buffers(); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; - /* fall through */ - - case SOCKNAL_INIT_NOTHING: - break; - } - - CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); -} - - -void -ksocknal_init_incarnation (void) -{ - struct timeval tv; - - /* The incarnation number is the time this module loaded and it - * identifies this particular instance of the socknal. Hopefully - * we won't be able to reboot more frequently than 1MHz for the - * forseeable future :) */ - - do_gettimeofday(&tv); - - ksocknal_data.ksnd_incarnation = - (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; -} - -int -ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); - int rc; - int i; - int j; - - LASSERT (nal == &ksocknal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } - - LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); - - memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - - ksocknal_init_incarnation(); - - ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (ksocknal_data.ksnd_peers, - sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); - if (ksocknal_data.ksnd_peers == NULL) - return (-ENOMEM); - - for (i = 0; i 
< ksocknal_data.ksnd_peer_hash_size; i++) - INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); - - rwlock_init(&ksocknal_data.ksnd_global_lock); - - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; - - spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; - - spin_lock_init (&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); - INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); - INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); - init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - - spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock); - INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes); - init_waitqueue_head(&ksocknal_data.ksnd_autoconnectd_waitq); - - /* NB memset above zeros whole of ksocknal_data, including - * ksocknal_data.ksnd_irqinfo[all].ksni_valid */ - - /* flag lists/ptrs/locks initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - - ksocknal_data.ksnd_nschedulers = ksocknal_nsched(); - PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, - sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } - - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; - - spin_lock_init (&kss->kss_lock); - INIT_LIST_HEAD (&kss->kss_rx_conns); - INIT_LIST_HEAD (&kss->kss_tx_conns); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&kss->kss_zctxdone_list); -#endif - init_waitqueue_head (&kss->kss_waitq); - } - - /* NB we have to wait to be told our true NID... 
*/ - process_id.pid = requested_pid; - process_id.nid = 0; - - rc = lib_init(&ksocknal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called - - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - rc = ksocknal_thread_start (ksocknal_scheduler, - &ksocknal_data.ksnd_schedulers[i]); - if (rc != 0) { - CERROR("Can't spawn socknal scheduler[%d]: %d\n", - i, rc); - ksocknal_api_shutdown (nal); - return (rc); - } - } - - for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) { - rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); - if (rc != 0) { - CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - } - - rc = ksocknal_thread_start (ksocknal_reaper, NULL); - if (rc != 0) { - CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - - rc = kpr_register(&ksocknal_data.ksnd_router, - &ksocknal_router_interface); - if (rc != 0) { - CDEBUG(D_NET, "Can't initialise routing interface " - "(rc = %d): not routing\n", rc); - } else { - /* Only allocate forwarding buffers if there's a router */ - - for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb; - ksock_fmb_pool_t *pool; - - - if (i < SOCKNAL_SMALL_FWD_NMSGS) - pool = &ksocknal_data.ksnd_small_fmp; - else - pool = &ksocknal_data.ksnd_large_fmp; - - PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, - fmb_kiov[pool->fmp_buff_pages])); - if (fmb == NULL) { - ksocknal_api_shutdown(nal); - return (-ENOMEM); - } - - fmb->fmb_pool = pool; - - for (j = 0; j < pool->fmp_buff_pages; j++) { - fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); - - if (fmb->fmb_kiov[j].kiov_page == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } - - LASSERT(page_address(fmb->fmb_kiov[j].kiov_page) != 
NULL); - } - - list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); - } - } - - rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - - /* flag everything initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - - printk(KERN_INFO "Lustre: Routing socket NAL loaded " - "(Routing %s, initial mem %d, incarnation "LPX64")\n", - kpr_routing (&ksocknal_data.ksnd_router) ? - "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation); - - return (0); -} - -void __exit -ksocknal_module_fini (void) -{ -#ifdef CONFIG_SYSCTL - if (ksocknal_tunables.ksnd_sysctl != NULL) - unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); -#endif - PtlNIFini(ksocknal_ni); - - ptl_unregister_nal(SOCKNAL); -} - -int __init -ksocknal_module_init (void) -{ - int rc; - - /* packet descriptor must fit in a router descriptor's scratchpad */ - LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int)); -#if CPU_AFFINITY - LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int)); -#endif -#if SOCKNAL_ZC - LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int)); -#endif - /* check ksnr_connected/connecting field large enough */ - 
LASSERT(SOCKNAL_CONN_NTYPES <= 4); - - ksocknal_api.nal_ni_init = ksocknal_api_startup; - ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; - ksocknal_tunables.ksnd_eager_ack = SOCKNAL_EAGER_ACK; - ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; - ksocknal_tunables.ksnd_min_bulk = SOCKNAL_MIN_BULK; - ksocknal_tunables.ksnd_buffer_size = SOCKNAL_BUFFER_SIZE; - ksocknal_tunables.ksnd_nagle = SOCKNAL_NAGLE; - ksocknal_tunables.ksnd_keepalive_idle = SOCKNAL_KEEPALIVE_IDLE; - ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT; - ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL; -#if CPU_AFFINITY - ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY; -#endif -#if SOCKNAL_ZC - ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif - - rc = ptl_register_nal(SOCKNAL, &ksocknal_api); - if (rc != PTL_OK) { - CERROR("Can't register SOCKNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(SOCKNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - ksocknal_tunables.ksnd_sysctl = - register_sysctl_table (ksocknal_top_ctl_table, 0); -#endif - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init(ksocknal_module_init); -module_exit(ksocknal_module_fini); - diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h deleted file mode 100644 index 9cfe858..0000000 --- a/lustre/portals/knals/socknal/socknal.h +++ /dev/null @@ -1,526 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#define DEBUG_PORTAL_ALLOC -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include -#include -#include -#include - -#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ - -#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ -#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -/* default vals for runtime tunables */ -#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ -#define SOCKNAL_EAGER_ACK 0 /* default eager ack (boolean) */ -#define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small? */ -#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ -#define SOCKNAL_MIN_BULK (1<<10) /* smallest "large" message */ -#define SOCKNAL_BUFFER_SIZE (8<<20) /* default socket buffer size */ -#define SOCKNAL_NAGLE 0 /* enable/disable NAGLE? */ -#define SOCKNAL_IRQ_AFFINITY 1 /* enable/disable IRQ affinity? */ -#define SOCKNAL_KEEPALIVE_IDLE 0 /* # seconds idle before 1st probe */ -#define SOCKNAL_KEEPALIVE_COUNT 10 /* # unanswered probes to determine peer death */ -#define SOCKNAL_KEEPALIVE_INTVL 1 /* seconds between probes */ - -#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ -#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ - -#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ - -#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT) - /* # pages in a large message fwd buffer */ - -#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */ - -#define SOCKNAL_MAX_INTERFACES 16 /* Largest number of interfaces we bind */ - -#define SOCKNAL_ROUND_ROBIN 0 /* round robin / load balance */ - -#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) - -#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ -#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ -#define SOCKNAL_RISK_KMAP_DEADLOCK 0 /* risk kmap deadlock on multi-frag I/O - * (backs off to single-frag if disabled) */ - -#if (LINUX_VERSION_CODE < 
KERNEL_VERSION(2,5,72)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -#endif - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_wmem_queued wmem_queued -# define sk_err err -#endif - -typedef struct /* pool of forwarding buffers */ -{ - spinlock_t fmp_lock; /* serialise */ - struct list_head fmp_idle_fmbs; /* free buffers */ - struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ - int fmp_nactive_fmbs; /* # buffers in use */ - int fmp_buff_pages; /* # pages per buffer */ -} ksock_fmb_pool_t; - - -typedef struct /* per scheduler state */ -{ - spinlock_t kss_lock; /* serialise */ - struct list_head kss_rx_conns; /* conn waiting to be read */ - struct list_head kss_tx_conns; /* conn waiting to be written */ -#if SOCKNAL_ZC - struct list_head kss_zctxdone_list; /* completed ZC transmits */ -#endif - wait_queue_head_t kss_waitq; /* where scheduler sleeps */ - int kss_nconns; /* # connections assigned to this scheduler */ -} ksock_sched_t; - -typedef struct -{ - int ksni_valid:1; /* been set yet? */ - int ksni_bound:1; /* bound to a cpu yet? */ - int ksni_sched:6; /* which scheduler (assumes < 64) */ -} ksock_irqinfo_t; - -typedef struct -{ - __u32 ksni_ipaddr; /* interface's IP address */ - __u32 ksni_netmask; /* interface's network mask */ - int ksni_nroutes; /* # routes using (active) */ - int ksni_npeers; /* # peers using (passive) */ -} ksock_interface_t; - -typedef struct -{ - int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ - int ksnd_eager_ack; /* make TCP ack eagerly? */ - int ksnd_typed_conns; /* drive sockets by type? */ - int ksnd_min_bulk; /* smallest "large" message */ - int ksnd_buffer_size; /* socket buffer size */ - int ksnd_nagle; /* enable NAGLE? */ - int ksnd_irq_affinity; /* enable IRQ affinity? 
*/ - int ksnd_keepalive_idle; /* # idle secs before 1st probe */ - int ksnd_keepalive_count; /* # probes */ - int ksnd_keepalive_intvl; /* time between probes */ -#if SOCKNAL_ZC - unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ -#endif - struct ctl_table_header *ksnd_sysctl; /* sysctl interface */ -} ksock_tunables_t; - -typedef struct -{ - int ksnd_init; /* initialisation state */ - __u64 ksnd_incarnation; /* my epoch */ - - rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ - struct list_head *ksnd_peers; /* hash table of all my known peers */ - int ksnd_peer_hash_size; /* size of ksnd_peers */ - - int ksnd_nthreads; /* # live threads */ - int ksnd_shuttingdown; /* tell threads to exit */ - int ksnd_nschedulers; /* # schedulers */ - ksock_sched_t *ksnd_schedulers; /* their state */ - - atomic_t ksnd_npeers; /* total # peers extant */ - atomic_t ksnd_nclosing_conns; /* # closed conns extant */ - - kpr_router_t ksnd_router; /* THE router */ - - ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ - ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ - - atomic_t ksnd_nactive_ltxs; /* #active ltxs */ - - struct list_head ksnd_deathrow_conns; /* conns to be closed */ - struct list_head ksnd_zombie_conns; /* conns to be freed */ - struct list_head ksnd_enomem_conns; /* conns to be retried */ - wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ - unsigned long ksnd_reaper_waketime; /* when reaper will wake */ - spinlock_t ksnd_reaper_lock; /* serialise */ - - int ksnd_enomem_tx; /* test ENOMEM sender */ - int ksnd_stall_tx; /* test sluggish sender */ - int ksnd_stall_rx; /* test sluggish receiver */ - - struct list_head ksnd_autoconnectd_routes; /* routes waiting to be connected */ - wait_queue_head_t ksnd_autoconnectd_waitq; /* autoconnectds sleep here */ - spinlock_t ksnd_autoconnectd_lock; /* serialise */ - - ksock_irqinfo_t ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */ - - int 
ksnd_ninterfaces; - ksock_interface_t ksnd_interfaces[SOCKNAL_MAX_INTERFACES]; /* published interfaces */ -} ksock_nal_data_t; - -#define SOCKNAL_INIT_NOTHING 0 -#define SOCKNAL_INIT_DATA 1 -#define SOCKNAL_INIT_LIB 2 -#define SOCKNAL_INIT_ALL 3 - -/* A packet just assembled for transmission is represented by 1 or more - * struct iovec fragments (the first frag contains the portals header), - * followed by 0 or more ptl_kiov_t fragments. - * - * On the receive side, initially 1 struct iovec fragment is posted for - * receive (the header). Once the header has been received, the payload is - * received into either struct iovec or ptl_kiov_t fragments, depending on - * what the header matched or whether the message needs forwarding. */ - -struct ksock_conn; /* forward ref */ -struct ksock_peer; /* forward ref */ -struct ksock_route; /* forward ref */ - -typedef struct /* transmit packet */ -{ - struct list_head tx_list; /* queue on conn for transmission etc */ - char tx_isfwd; /* forwarding / sourced here */ - int tx_nob; /* # packet bytes */ - int tx_resid; /* residual bytes */ - int tx_niov; /* # packet iovec frags */ - struct iovec *tx_iov; /* packet iovec frags */ - int tx_nkiov; /* # packet page frags */ - ptl_kiov_t *tx_kiov; /* packet page frags */ - struct ksock_conn *tx_conn; /* owning conn */ - ptl_hdr_t *tx_hdr; /* packet header (for debug only) */ -#if SOCKNAL_ZC - zccd_t tx_zccd; /* zero copy callback descriptor */ -#endif -} ksock_tx_t; - -typedef struct /* forwarded packet */ -{ - ksock_tx_t ftx_tx; /* send info */ - struct iovec ftx_iov; /* hdr iovec */ -} ksock_ftx_t; - -#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) -/* network zero copy callback descriptor embedded in ksock_tx_t */ - -typedef struct /* locally transmitted packet */ -{ - ksock_tx_t ltx_tx; /* send info */ - void *ltx_private; /* lib_finalize() callback arg */ - void *ltx_cookie; /* lib_finalize() callback arg */ - ptl_hdr_t ltx_hdr; /* buffer for packet header */ 
- int ltx_desc_size; /* bytes allocated for this desc */ - struct iovec ltx_iov[1]; /* iov for hdr + payload */ - ptl_kiov_t ltx_kiov[0]; /* kiov for payload */ -} ksock_ltx_t; - -#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) -/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ - -#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) -/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ - -/* NB list_entry() is used here as convenient macro for calculating a - * pointer to a struct from the address of a member. */ - -typedef struct /* Kernel portals Socket Forwarding message buffer */ -{ /* (socknal->router) */ - struct list_head fmb_list; /* queue idle */ - kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ - ksock_fmb_pool_t *fmb_pool; /* owning pool */ - struct ksock_peer *fmb_peer; /* peer received from */ - ptl_hdr_t fmb_hdr; /* message header */ - ptl_kiov_t fmb_kiov[0]; /* payload frags */ -} ksock_fmb_t; - -/* space for the rx frag descriptors; we either read a single contiguous - * header, or up to PTL_MD_MAX_IOV frags of payload of either type. 
*/ -typedef union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; -} ksock_rxiovspace_t; - -#define SOCKNAL_RX_HEADER 1 /* reading header */ -#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ -#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ -#define SOCKNAL_RX_SLOP 4 /* skipping body */ -#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ -#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ - -typedef struct ksock_conn -{ - struct ksock_peer *ksnc_peer; /* owning peer */ - struct ksock_route *ksnc_route; /* owning route */ - struct list_head ksnc_list; /* stash on peer's conn list */ - struct socket *ksnc_sock; /* actual socket */ - void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ - void *ksnc_saved_write_space; /* socket's original write_space() callback */ - atomic_t ksnc_refcount; /* # users */ - ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ - __u32 ksnc_myipaddr; /* my IP */ - __u32 ksnc_ipaddr; /* peer's IP */ - int ksnc_port; /* peer's port */ - int ksnc_closing; /* being shut down */ - int ksnc_type; /* type of connection */ - __u64 ksnc_incarnation; /* peer's incarnation */ - - /* reader */ - struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ - unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ - int ksnc_rx_started; /* started receiving a message */ - int ksnc_rx_ready; /* data ready to read */ - int ksnc_rx_scheduled; /* being progressed */ - int ksnc_rx_state; /* what is being read */ - int ksnc_rx_nob_left; /* # bytes to next hdr/body */ - int ksnc_rx_nob_wanted; /* bytes actually wanted */ - int ksnc_rx_niov; /* # iovec frags */ - struct iovec *ksnc_rx_iov; /* the iovec frags */ - int ksnc_rx_nkiov; /* # page frags */ - ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ - ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ - void *ksnc_cookie; /* rx 
lib_finalize passthru arg */ - ptl_hdr_t ksnc_hdr; /* where I read headers into */ - - /* WRITER */ - struct list_head ksnc_tx_list; /* where I enq waiting for output space */ - struct list_head ksnc_tx_queue; /* packets waiting to be sent */ - unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ - int ksnc_tx_bufnob; /* send buffer marker */ - atomic_t ksnc_tx_nob; /* # bytes queued */ - int ksnc_tx_ready; /* write space */ - int ksnc_tx_scheduled; /* being progressed */ - -#if !SOCKNAL_SINGLE_FRAG_RX - struct iovec ksnc_rx_scratch_iov[PTL_MD_MAX_IOV]; -#endif -#if !SOCKNAL_SINGLE_FRAG_TX - struct iovec ksnc_tx_scratch_iov[PTL_MD_MAX_IOV]; -#endif -} ksock_conn_t; - -#define KSNR_TYPED_ROUTES ((1 << SOCKNAL_CONN_CONTROL) | \ - (1 << SOCKNAL_CONN_BULK_IN) | \ - (1 << SOCKNAL_CONN_BULK_OUT)) - -typedef struct ksock_route -{ - struct list_head ksnr_list; /* chain on peer route list */ - struct list_head ksnr_connect_list; /* chain on autoconnect list */ - struct ksock_peer *ksnr_peer; /* owning peer */ - atomic_t ksnr_refcount; /* # users */ - unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ - unsigned int ksnr_retry_interval; /* how long between retries */ - __u32 ksnr_myipaddr; /* my IP */ - __u32 ksnr_ipaddr; /* IP address to connect to */ - int ksnr_port; /* port to connect to */ - unsigned int ksnr_connecting:4; /* autoconnects in progress by type */ - unsigned int ksnr_connected:4; /* connections established by type */ - unsigned int ksnr_deleted:1; /* been removed from peer? */ - unsigned int ksnr_share_count; /* created explicitly? 
*/ - int ksnr_conn_count; /* # conns established by this route */ -} ksock_route_t; - -typedef struct ksock_peer -{ - struct list_head ksnp_list; /* stash on global peer list */ - ptl_nid_t ksnp_nid; /* who's on the other end(s) */ - atomic_t ksnp_refcount; /* # users */ - int ksnp_sharecount; /* lconf usage counter */ - int ksnp_closing; /* being closed */ - int ksnp_error; /* errno on closing last conn */ - struct list_head ksnp_conns; /* all active connections */ - struct list_head ksnp_routes; /* routes */ - struct list_head ksnp_tx_queue; /* waiting packets */ - unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ - int ksnp_n_passive_ips; /* # of... */ - __u32 ksnp_passive_ips[SOCKNAL_MAX_INTERFACES]; /* preferred local interfaces */ -} ksock_peer_t; - - -extern lib_nal_t ksocknal_lib; -extern ksock_nal_data_t ksocknal_data; -extern ksock_tunables_t ksocknal_tunables; - -static inline struct list_head * -ksocknal_nid2peerlist (ptl_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; - - return (&ksocknal_data.ksnd_peers [hash]); -} - -static inline int -ksocknal_getconnsock (ksock_conn_t *conn) -{ - int rc = -ESHUTDOWN; - - read_lock (&ksocknal_data.ksnd_global_lock); - if (!conn->ksnc_closing) { - rc = 0; - get_file (conn->ksnc_sock->file); - } - read_unlock (&ksocknal_data.ksnd_global_lock); - - return (rc); -} - -static inline void -ksocknal_putconnsock (ksock_conn_t *conn) -{ - fput (conn->ksnc_sock->file); -} - -#ifndef CONFIG_SMP -static inline -int ksocknal_nsched(void) -{ - return 1; -} -#else -#include -# if !(defined(CONFIG_X86) && (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,21))) || defined(CONFIG_X86_64) || (LUSTRE_KERNEL_VERSION < 39) || ((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) && !defined(CONFIG_X86_HT)) -static inline int -ksocknal_nsched(void) -{ - return num_online_cpus(); -} - -static inline int -ksocknal_sched2cpu(int i) -{ - return i; -} - -static inline int 
-ksocknal_irqsched2cpu(int i) -{ - return i; -} -# else -static inline int -ksocknal_nsched(void) -{ - if (smp_num_siblings == 1) - return (num_online_cpus()); - - /* We need to know if this assumption is crap */ - LASSERT (smp_num_siblings == 2); - return (num_online_cpus()/2); -} - -static inline int -ksocknal_sched2cpu(int i) -{ - if (smp_num_siblings == 1) - return i; - - return (i * 2); -} - -static inline int -ksocknal_irqsched2cpu(int i) -{ - return (ksocknal_sched2cpu(i) + 1); -} -# endif -#endif - -extern void ksocknal_put_route (ksock_route_t *route); -extern void ksocknal_put_peer (ksock_peer_t *peer); -extern ksock_peer_t *ksocknal_find_peer_locked (ptl_nid_t nid); -extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid); -extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, - int single, int keep_conn); -extern int ksocknal_create_conn (ksock_route_t *route, - struct socket *sock, int type); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); -extern void ksocknal_terminate_conn (ksock_conn_t *conn); -extern void ksocknal_destroy_conn (ksock_conn_t *conn); -extern void ksocknal_put_conn (ksock_conn_t *conn); -extern int ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation); -extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); -extern int ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr); - -extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); -extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); -extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void ksocknal_fmb_callback (void *arg, int error); -extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive); -extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); -extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); -extern int ksocknal_scheduler (void *arg); -extern void ksocknal_data_ready(struct sock *sk, int n); -extern void 
ksocknal_write_space(struct sock *sk); -extern int ksocknal_autoconnectd (void *arg); -extern int ksocknal_reaper (void *arg); -extern int ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, - int *rxmem, int *nagle); -extern int ksocknal_setup_sock (struct socket *sock); -extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs); -extern int ksocknal_recv_hello (ksock_conn_t *conn, - ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs); diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c deleted file mode 100644 index ed91f94..0000000 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ /dev/null @@ -1,2934 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include "socknal.h" -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -# include -#endif - -/* - * LIB functions follow - * - */ -int -ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if ksocknal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if (nal->libnal_ni.ni_pid.nid == nid) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -void -ksocknal_free_ltx (ksock_ltx_t *ltx) -{ - atomic_dec(&ksocknal_data.ksnd_nactive_ltxs); - PORTAL_FREE(ltx, ltx->ltx_desc_size); -} - -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) -struct page * -ksocknal_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); - - if (page == NULL || - !VALID_PAGE (page)) - return (NULL); - - return (page); -} -#endif - -int -ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) -{ - struct socket *sock = conn->ksnc_sock; - struct iovec *iov = tx->tx_iov; -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) - unsigned long vaddr = (unsigned long)iov->iov_base - int offset = vaddr & (PAGE_SIZE - 1); - int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset); - struct page *page; -#endif - int nob; - int rc; - - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. 
*/ - LASSERT (tx->tx_niov > 0); - -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) - if (zcsize >= ksocknal_data.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { - int msgflg = MSG_DONTWAIT; - - CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", - (void *)vaddr, page, page_address(page), offset, zcsize); - - if (!list_empty (&conn->ksnc_tx_queue) || - zcsize < tx->tx_resid) - msgflg |= MSG_MORE; - - rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd); - } else -#endif - { -#if SOCKNAL_SINGLE_FRAG_TX - struct iovec scratch; - struct iovec *scratchiov = &scratch; - int niov = 1; -#else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_niov; -#endif - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - mm_segment_t oldmm = get_fs(); - int i; - - for (nob = i = 0; i < niov; i++) { - scratchiov[i] = tx->tx_iov[i]; - nob += scratchiov[i].iov_len; - } - - if (!list_empty(&conn->ksnc_tx_queue) || - nob < tx->tx_resid) - msg.msg_flags |= MSG_MORE; - - set_fs (KERNEL_DS); - rc = sock_sendmsg(sock, &msg, nob); - set_fs (oldmm); - } - - if (rc <= 0) /* sent nothing? 
*/ - return (rc); - - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; - - /* "consume" iov */ - do { - LASSERT (tx->tx_niov > 0); - - if (nob < iov->iov_len) { - iov->iov_base = (void *)(((unsigned long)(iov->iov_base)) + nob); - iov->iov_len -= nob; - return (rc); - } - - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob != 0); - - return (rc); -} - -int -ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) -{ - struct socket *sock = conn->ksnc_sock; - ptl_kiov_t *kiov = tx->tx_kiov; - int rc; - int nob; - - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - LASSERT (tx->tx_niov == 0); - LASSERT (tx->tx_nkiov > 0); - -#if SOCKNAL_ZC - if (kiov->kiov_len >= ksocknal_tunables.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { - struct page *page = kiov->kiov_page; - int offset = kiov->kiov_offset; - int fragsize = kiov->kiov_len; - int msgflg = MSG_DONTWAIT; - - CDEBUG(D_NET, "page %p + offset %x for %d\n", - page, offset, kiov->kiov_len); - - if (!list_empty(&conn->ksnc_tx_queue) || - fragsize < tx->tx_resid) - msgflg |= MSG_MORE; - - rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg, - &tx->tx_zccd); - } else -#endif - { -#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct iovec scratch; - struct iovec *scratchiov = &scratch; - int niov = 1; -#else -#warning "XXX risk of kmap deadlock on multiple frags..." 
- struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_nkiov; -#endif - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - mm_segment_t oldmm = get_fs(); - int i; - - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + - kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; - } - - if (!list_empty(&conn->ksnc_tx_queue) || - nob < tx->tx_resid) - msg.msg_flags |= MSG_DONTWAIT; - - set_fs (KERNEL_DS); - rc = sock_sendmsg(sock, &msg, nob); - set_fs (oldmm); - - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); - } - - if (rc <= 0) /* sent nothing? */ - return (rc); - - nob = rc; - LASSERT (nob <= tx->tx_resid); - tx->tx_resid -= nob; - - do { - LASSERT(tx->tx_nkiov > 0); - - if (nob < kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return rc; - } - - nob -= kiov->kiov_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob != 0); - - return (rc); -} - -int -ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) -{ - int rc; - int bufnob; - - if (ksocknal_data.ksnd_stall_tx != 0) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (ksocknal_data.ksnd_stall_tx * HZ); - } - - LASSERT (tx->tx_resid != 0); - - rc = ksocknal_getconnsock (conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } - - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov != 0) { - rc = ksocknal_send_iov (conn, tx); - } else { - rc = ksocknal_send_kiov (conn, tx); - } - - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? 
*/ - conn->ksnc_tx_bufnob += rc; /* account it */ - - if (bufnob < conn->ksnc_tx_bufnob) { - /* allocated send buffer bytes < computed; infer - * something got ACKed */ - conn->ksnc_tx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_tx_bufnob = bufnob; - mb(); - } - - if (rc <= 0) { /* Didn't write anything? */ - unsigned long flags; - ksock_sched_t *sched; - - if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ - rc = -EAGAIN; - - if (rc != -EAGAIN) - break; - - /* Check if EAGAIN is due to memory pressure */ - - sched = conn->ksnc_scheduler; - spin_lock_irqsave(&sched->kss_lock, flags); - - if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && - !conn->ksnc_tx_ready) { - /* SOCK_NOSPACE is set when the socket fills - * and cleared in the write_space callback - * (which also sets ksnc_tx_ready). If - * SOCK_NOSPACE and ksnc_tx_ready are BOTH - * zero, I didn't fill the socket and - * write_space won't reschedule me, so I - * return -ENOMEM to get my caller to retry - * after a timeout */ - rc = -ENOMEM; - } - - spin_unlock_irqrestore(&sched->kss_lock, flags); - break; - } - - /* socket's wmem_queued now includes 'rc' bytes */ - atomic_sub (rc, &conn->ksnc_tx_nob); - rc = 0; - - } while (tx->tx_resid != 0); - - ksocknal_putconnsock (conn); - return (rc); -} - -void -ksocknal_eager_ack (ksock_conn_t *conn) -{ - int opt = 1; - mm_segment_t oldmm = get_fs(); - struct socket *sock = conn->ksnc_sock; - - /* Remind the socket to ACK eagerly. If I don't, the socket might - * think I'm about to send something it could piggy-back the ACK - * on, introducing delay in completing zero-copy sends in my - * peer. 
*/ - - set_fs(KERNEL_DS); - sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, - (char *)&opt, sizeof (opt)); - set_fs(oldmm); -} - -int -ksocknal_recv_iov (ksock_conn_t *conn) -{ -#if SOCKNAL_SINGLE_FRAG_RX - struct iovec scratch; - struct iovec *scratchiov = &scratch; - int niov = 1; -#else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_niov; -#endif - struct iovec *iov = conn->ksnc_rx_iov; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - mm_segment_t oldmm = get_fs(); - int nob; - int i; - int rc; - - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - LASSERT (niov > 0); - - for (nob = i = 0; i < niov; i++) { - scratchiov[i] = iov[i]; - nob += scratchiov[i].iov_len; - } - LASSERT (nob <= conn->ksnc_rx_nob_wanted); - - set_fs (KERNEL_DS); - rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); - /* NB this is just a boolean..........................^ */ - set_fs (oldmm); - - if (rc <= 0) - return (rc); - - /* received something... 
*/ - nob = rc; - - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_rx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - mb(); /* order with setting rx_started */ - conn->ksnc_rx_started = 1; - - conn->ksnc_rx_nob_wanted -= nob; - conn->ksnc_rx_nob_left -= nob; - - do { - LASSERT (conn->ksnc_rx_niov > 0); - - if (nob < iov->iov_len) { - iov->iov_len -= nob; - iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); - return (-EAGAIN); - } - - nob -= iov->iov_len; - conn->ksnc_rx_iov = ++iov; - conn->ksnc_rx_niov--; - } while (nob != 0); - - return (rc); -} - -int -ksocknal_recv_kiov (ksock_conn_t *conn) -{ -#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct iovec scratch; - struct iovec *scratchiov = &scratch; - int niov = 1; -#else -#warning "XXX risk of kmap deadlock on multiple frags..." - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_nkiov; -#endif - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - mm_segment_t oldmm = get_fs(); - int nob; - int i; - int rc; - - LASSERT (conn->ksnc_rx_nkiov > 0); - - /* NB we can't trust socket ops to either consume our iovs - * or leave them alone. */ - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; - } - LASSERT (nob <= conn->ksnc_rx_nob_wanted); - - set_fs (KERNEL_DS); - rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); - /* NB this is just a boolean.......................^ */ - set_fs (oldmm); - - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); - - if (rc <= 0) - return (rc); - - /* received something... 
*/ - nob = rc; - - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_rx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - mb(); /* order with setting rx_started */ - conn->ksnc_rx_started = 1; - - conn->ksnc_rx_nob_wanted -= nob; - conn->ksnc_rx_nob_left -= nob; - - do { - LASSERT (conn->ksnc_rx_nkiov > 0); - - if (nob < kiov->kiov_len) { - kiov->kiov_offset += nob; - kiov->kiov_len -= nob; - return -EAGAIN; - } - - nob -= kiov->kiov_len; - conn->ksnc_rx_kiov = ++kiov; - conn->ksnc_rx_nkiov--; - } while (nob != 0); - - return 1; -} - -int -ksocknal_receive (ksock_conn_t *conn) -{ - /* Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_nob_wanted to determine - * progress/completion. */ - int rc; - ENTRY; - - if (ksocknal_data.ksnd_stall_rx != 0) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (ksocknal_data.ksnd_stall_rx * HZ); - } - - rc = ksocknal_getconnsock (conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - return (-ESHUTDOWN); - } - - for (;;) { - if (conn->ksnc_rx_niov != 0) - rc = ksocknal_recv_iov (conn); - else - rc = ksocknal_recv_kiov (conn); - - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (rc == 0 && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } - - /* Completed a fragment */ - - if (conn->ksnc_rx_nob_wanted == 0) { - /* Completed a message segment (header or payload) */ - if ((ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 && - (conn->ksnc_rx_state == SOCKNAL_RX_BODY || - conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { - /* Remind the socket to ack eagerly... 
*/ - ksocknal_eager_ack(conn); - } - rc = 1; - break; - } - } - - ksocknal_putconnsock (conn); - RETURN (rc); -} - -#if SOCKNAL_ZC -void -ksocknal_zc_callback (zccd_t *zcd) -{ - ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); - ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler; - unsigned long flags; - ENTRY; - - /* Schedule tx for cleanup (can't do it now due to lock conflicts) */ - - spin_lock_irqsave (&sched->kss_lock, flags); - - list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); - wake_up (&sched->kss_waitq); - - spin_unlock_irqrestore (&sched->kss_lock, flags); - EXIT; -} -#endif - -void -ksocknal_tx_done (ksock_tx_t *tx, int asynch) -{ - ksock_ltx_t *ltx; - ENTRY; - - if (tx->tx_conn != NULL) { -#if SOCKNAL_ZC - /* zero copy completion isn't always from - * process_transmit() so it needs to keep a ref on - * tx_conn... */ - if (asynch) - ksocknal_put_conn (tx->tx_conn); -#else - LASSERT (!asynch); -#endif - } - - if (tx->tx_isfwd) { /* was a forwarded packet? */ - kpr_fwd_done (&ksocknal_data.ksnd_router, - KSOCK_TX_2_KPR_FWD_DESC (tx), - (tx->tx_resid == 0) ? 0 : -ECONNABORTED); - EXIT; - return; - } - - /* local send */ - ltx = KSOCK_TX_2_KSOCK_LTX (tx); - - lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie, - (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL); - - ksocknal_free_ltx (ltx); - EXIT; -} - -void -ksocknal_tx_launched (ksock_tx_t *tx) -{ -#if SOCKNAL_ZC - if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - ksock_conn_t *conn = tx->tx_conn; - - /* zccd skbufs are still in-flight. First take a ref on - * conn, so it hangs about for ksocknal_tx_done... 
*/ - atomic_inc (&conn->ksnc_refcount); - - /* ...then drop the initial ref on zccd, so the zero copy - * callback can occur */ - zccd_put (&tx->tx_zccd); - return; - } -#endif - /* Any zero-copy-ness (if any) has completed; I can complete the - * transmit now, avoiding an extra schedule */ - ksocknal_tx_done (tx, 0); -} - -int -ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) -{ - unsigned long flags; - int rc; - - rc = ksocknal_transmit (conn, tx); - - CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); - - if (tx->tx_resid == 0) { - /* Sent everything OK */ - LASSERT (rc == 0); - - ksocknal_tx_launched (tx); - return (0); - } - - if (rc == -EAGAIN) - return (rc); - - if (rc == -ENOMEM) { - static int counter; - - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%d ENOMEM tx %p\n", counter, conn); - - /* Queue on ksnd_enomem_conns for retry after a timeout */ - spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags); - - /* enomem list takes over scheduler's ref... 
*/ - LASSERT (conn->ksnc_tx_scheduled); - list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY, - ksocknal_data.ksnd_reaper_waketime)) - wake_up (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags); - return (rc); - } - - /* Actual error */ - LASSERT (rc < 0); - - if (!conn->ksnc_closing) - CERROR("[%p] Error %d on write to "LPX64 - " ip %d.%d.%d.%d:%d\n", conn, rc, - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); - - ksocknal_close_conn_and_siblings (conn, rc); - ksocknal_tx_launched (tx); - - return (rc); -} - -void -ksocknal_launch_autoconnect_locked (ksock_route_t *route) -{ - unsigned long flags; - - /* called holding write lock on ksnd_global_lock */ - - LASSERT (!route->ksnr_deleted); - LASSERT ((route->ksnr_connected & (1 << SOCKNAL_CONN_ANY)) == 0); - LASSERT ((route->ksnr_connected & KSNR_TYPED_ROUTES) != KSNR_TYPED_ROUTES); - LASSERT (route->ksnr_connecting == 0); - - if (ksocknal_tunables.ksnd_typed_conns) - route->ksnr_connecting = - KSNR_TYPED_ROUTES & ~route->ksnr_connected; - else - route->ksnr_connecting = (1 << SOCKNAL_CONN_ANY); - - atomic_inc (&route->ksnr_refcount); /* extra ref for asynchd */ - - spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); - - list_add_tail (&route->ksnr_connect_list, - &ksocknal_data.ksnd_autoconnectd_routes); - wake_up (&ksocknal_data.ksnd_autoconnectd_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); -} - -ksock_peer_t * -ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) -{ - char ipbuf[PTL_NALFMT_SIZE]; - ptl_nid_t target_nid; - int rc; - ksock_peer_t *peer = ksocknal_find_peer_locked (nid); - - if (peer != NULL) - return (peer); - - if (tx->tx_isfwd) { - CERROR ("Can't send packet to "LPX64 - " %s: routed target is not a peer\n", - nid, portals_nid2str(SOCKNAL, nid, ipbuf)); - return (NULL); - } - - rc = 
kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, - &target_nid); - if (rc != 0) { - CERROR ("Can't route to "LPX64" %s: router error %d\n", - nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc); - return (NULL); - } - - peer = ksocknal_find_peer_locked (target_nid); - if (peer != NULL) - return (peer); - - CERROR ("Can't send packet to "LPX64" %s: no peer entry\n", - target_nid, portals_nid2str(SOCKNAL, target_nid, ipbuf)); - return (NULL); -} - -ksock_conn_t * -ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) -{ - struct list_head *tmp; - ksock_conn_t *typed = NULL; - int tnob = 0; - ksock_conn_t *fallback = NULL; - int fnob = 0; - ksock_conn_t *conn; - - list_for_each (tmp, &peer->ksnp_conns) { - ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); -#if SOCKNAL_ROUND_ROBIN - const int nob = 0; -#else - int nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; -#endif - LASSERT (!c->ksnc_closing); - - if (fallback == NULL || nob < fnob) { - fallback = c; - fnob = nob; - } - - if (!ksocknal_tunables.ksnd_typed_conns) - continue; - - switch (c->ksnc_type) { - default: - LBUG(); - case SOCKNAL_CONN_ANY: - break; - case SOCKNAL_CONN_BULK_IN: - continue; - case SOCKNAL_CONN_BULK_OUT: - if (tx->tx_nob < ksocknal_tunables.ksnd_min_bulk) - continue; - break; - case SOCKNAL_CONN_CONTROL: - if (tx->tx_nob >= ksocknal_tunables.ksnd_min_bulk) - continue; - break; - } - - if (typed == NULL || nob < tnob) { - typed = c; - tnob = nob; - } - } - - /* prefer the typed selection */ - conn = (typed != NULL) ? 
typed : fallback; - -#if SOCKNAL_ROUND_ROBIN - if (conn != NULL) { - /* round-robin all else being equal */ - list_del (&conn->ksnc_list); - list_add_tail (&conn->ksnc_list, &peer->ksnp_conns); - } -#endif - return conn; -} - -void -ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) -{ - unsigned long flags; - ksock_sched_t *sched = conn->ksnc_scheduler; - - /* called holding global lock (read or irq-write) and caller may - * not have dropped this lock between finding conn and calling me, - * so we don't need the {get,put}connsock dance to deref - * ksnc_sock... */ - LASSERT(!conn->ksnc_closing); - LASSERT(tx->tx_resid == tx->tx_nob); - - CDEBUG (D_NET, "Sending to "LPX64" ip %d.%d.%d.%d:%d\n", - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); - - atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); - tx->tx_conn = conn; - -#if SOCKNAL_ZC - zccd_init (&tx->tx_zccd, ksocknal_zc_callback); - /* NB this sets 1 ref on zccd, so the callback can only occur after - * I've released this ref. 
*/ -#endif - spin_lock_irqsave (&sched->kss_lock, flags); - - if (list_empty(&conn->ksnc_tx_queue) && - conn->ksnc_sock->sk->sk_wmem_queued == 0) { - /* First packet starts the timeout */ - conn->ksnc_tx_deadline = jiffies + - ksocknal_tunables.ksnd_io_timeout * HZ; - conn->ksnc_tx_bufnob = 0; - mb(); /* order with adding to tx_queue */ - } - - list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); - - if (conn->ksnc_tx_ready && /* able to send */ - !conn->ksnc_tx_scheduled) { /* not scheduled to send */ - /* +1 ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); - list_add_tail (&conn->ksnc_tx_list, - &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - wake_up (&sched->kss_waitq); - } - - spin_unlock_irqrestore (&sched->kss_lock, flags); -} - -ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_t *peer) -{ - struct list_head *tmp; - ksock_route_t *route; - int bits; - - list_for_each (tmp, &peer->ksnp_routes) { - route = list_entry (tmp, ksock_route_t, ksnr_list); - bits = route->ksnr_connected; - - /* All typed connections established? */ - if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES) - continue; - - /* Untyped connection established? */ - if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) - continue; - - /* connection being established? */ - if (route->ksnr_connecting != 0) - continue; - - /* too soon to retry this guy? 
*/ - if (!time_after_eq (jiffies, route->ksnr_timeout)) - continue; - - return (route); - } - - return (NULL); -} - -ksock_route_t * -ksocknal_find_connecting_route_locked (ksock_peer_t *peer) -{ - struct list_head *tmp; - ksock_route_t *route; - - list_for_each (tmp, &peer->ksnp_routes) { - route = list_entry (tmp, ksock_route_t, ksnr_list); - - if (route->ksnr_connecting != 0) - return (route); - } - - return (NULL); -} - -int -ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) -{ - unsigned long flags; - ksock_peer_t *peer; - ksock_conn_t *conn; - ksock_route_t *route; - rwlock_t *g_lock; - - /* Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. - * - * We always expect at least 1 mapped fragment containing the - * complete portals header. */ - LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + - lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); - LASSERT (tx->tx_niov >= 1); - LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); - - CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n", - tx, ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, - tx->tx_nob, tx->tx_niov, tx->tx_nkiov); - - tx->tx_conn = NULL; /* only set when assigned a conn */ - tx->tx_resid = tx->tx_nob; - tx->tx_hdr = (ptl_hdr_t *)tx->tx_iov[0].iov_base; - - g_lock = &ksocknal_data.ksnd_global_lock; -#if !SOCKNAL_ROUND_ROBIN - read_lock (g_lock); - - peer = ksocknal_find_target_peer_locked (tx, nid); - if (peer == NULL) { - read_unlock (g_lock); - return (-EHOSTUNREACH); - } - - if (ksocknal_find_connectable_route_locked(peer) == NULL) { - conn = ksocknal_find_conn_locked (tx, peer); - if (conn != NULL) { - /* I've got no autoconnect routes that need to be - * connecting and I do have an actual connection... */ - ksocknal_queue_tx_locked (tx, conn); - read_unlock (g_lock); - return (0); - } - } - - /* I'll need a write lock... 
*/ - read_unlock (g_lock); -#endif - write_lock_irqsave(g_lock, flags); - - peer = ksocknal_find_target_peer_locked (tx, nid); - if (peer == NULL) { - write_unlock_irqrestore(g_lock, flags); - return (-EHOSTUNREACH); - } - - for (;;) { - /* launch any/all autoconnections that need it */ - route = ksocknal_find_connectable_route_locked (peer); - if (route == NULL) - break; - - ksocknal_launch_autoconnect_locked (route); - } - - conn = ksocknal_find_conn_locked (tx, peer); - if (conn != NULL) { - /* Connection exists; queue message on it */ - ksocknal_queue_tx_locked (tx, conn); - write_unlock_irqrestore (g_lock, flags); - return (0); - } - - route = ksocknal_find_connecting_route_locked (peer); - if (route != NULL) { - /* At least 1 connection is being established; queue the - * message... */ - list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); - write_unlock_irqrestore (g_lock, flags); - return (0); - } - - write_unlock_irqrestore (g_lock, flags); - return (-EHOSTUNREACH); -} - -ptl_err_t -ksocknal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - ksock_ltx_t *ltx; - int desc_size; - int rc; - - /* NB 'private' is different depending on what we're sending. - * Just ignore it... 
*/ - - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", payload_nob, payload_niov, nid , pid); - - LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* It must be OK to kmap() if required */ - LASSERT (payload_kiov == NULL || !in_interrupt ()); - /* payload is either all vaddrs or all pages */ - LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - - if (payload_iov != NULL) - desc_size = offsetof(ksock_ltx_t, ltx_iov[1 + payload_niov]); - else - desc_size = offsetof(ksock_ltx_t, ltx_kiov[payload_niov]); - - if (in_interrupt() || - type == PTL_MSG_ACK || - type == PTL_MSG_REPLY) { - /* Can't block if in interrupt or responding to an incoming - * message */ - PORTAL_ALLOC_ATOMIC(ltx, desc_size); - } else { - PORTAL_ALLOC(ltx, desc_size); - } - - if (ltx == NULL) { - CERROR("Can't allocate tx desc type %d size %d %s\n", - type, desc_size, in_interrupt() ? "(intr)" : ""); - return (PTL_NO_SPACE); - } - - atomic_inc(&ksocknal_data.ksnd_nactive_ltxs); - - ltx->ltx_desc_size = desc_size; - - /* We always have 1 mapped frag for the header */ - ltx->ltx_tx.tx_iov = ltx->ltx_iov; - ltx->ltx_iov[0].iov_base = <x->ltx_hdr; - ltx->ltx_iov[0].iov_len = sizeof(*hdr); - ltx->ltx_hdr = *hdr; - - ltx->ltx_private = private; - ltx->ltx_cookie = cookie; - - ltx->ltx_tx.tx_isfwd = 0; - ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_nob; - - if (payload_iov != NULL) { - /* payload is all mapped */ - ltx->ltx_tx.tx_kiov = NULL; - ltx->ltx_tx.tx_nkiov = 0; - - ltx->ltx_tx.tx_niov = - 1 + lib_extract_iov(payload_niov, <x->ltx_iov[1], - payload_niov, payload_iov, - payload_offset, payload_nob); - } else { - /* payload is all pages */ - ltx->ltx_tx.tx_niov = 1; - - ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; - ltx->ltx_tx.tx_nkiov = - lib_extract_kiov(payload_niov, ltx->ltx_kiov, - payload_niov, payload_kiov, - payload_offset, payload_nob); - } - - rc = ksocknal_launch_packet(<x->ltx_tx, nid); - if (rc == 0) - return 
(PTL_OK); - - ksocknal_free_ltx(ltx); - return (PTL_FAIL); -} - -ptl_err_t -ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) -{ - return (ksocknal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} - -ptl_err_t -ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (ksocknal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); -} - -void -ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - ptl_nid_t nid = fwd->kprfd_gateway_nid; - ksock_ftx_t *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch; - int rc; - - CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, - fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); - - /* I'm the gateway; must be the last hop */ - if (nid == ksocknal_lib.libnal_ni.ni_pid.nid) - nid = fwd->kprfd_target_nid; - - /* setup iov for hdr */ - ftx->ftx_iov.iov_base = fwd->kprfd_hdr; - ftx->ftx_iov.iov_len = sizeof(ptl_hdr_t); - - ftx->ftx_tx.tx_isfwd = 1; /* This is a forwarding packet */ - ftx->ftx_tx.tx_nob = sizeof(ptl_hdr_t) + fwd->kprfd_nob; - ftx->ftx_tx.tx_niov = 1; - ftx->ftx_tx.tx_iov = &ftx->ftx_iov; - ftx->ftx_tx.tx_nkiov = fwd->kprfd_niov; - ftx->ftx_tx.tx_kiov = fwd->kprfd_kiov; - - rc = ksocknal_launch_packet (&ftx->ftx_tx, nid); - if (rc != 0) - kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, rc); -} - -int -ksocknal_thread_start (int (*fn)(void *arg), void *arg) -{ - long pid = kernel_thread (fn, arg, 0); - unsigned long flags; - - if (pid < 0) - return ((int)pid); - - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); - 
ksocknal_data.ksnd_nthreads++; - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); - return (0); -} - -void -ksocknal_thread_fini (void) -{ - unsigned long flags; - - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); - ksocknal_data.ksnd_nthreads--; - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); -} - -void -ksocknal_fmb_callback (void *arg, int error) -{ - ksock_fmb_t *fmb = (ksock_fmb_t *)arg; - ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = &fmb->fmb_hdr; - ksock_conn_t *conn = NULL; - ksock_sched_t *sched; - unsigned long flags; - char ipbuf[PTL_NALFMT_SIZE]; - char ipbuf2[PTL_NALFMT_SIZE]; - - if (error != 0) - CERROR("Failed to route packet from " - LPX64" %s to "LPX64" %s: %d\n", - le64_to_cpu(hdr->src_nid), - portals_nid2str(SOCKNAL, le64_to_cpu(hdr->src_nid), ipbuf), - le64_to_cpu(hdr->dest_nid), - portals_nid2str(SOCKNAL, le64_to_cpu(hdr->dest_nid), ipbuf2), - error); - else - CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", - le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid)); - - /* drop peer ref taken on init */ - ksocknal_put_peer (fmb->fmb_peer); - - spin_lock_irqsave (&fmp->fmp_lock, flags); - - list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); - fmp->fmp_nactive_fmbs--; - - if (!list_empty (&fmp->fmp_blocked_conns)) { - conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, - ksock_conn_t, ksnc_rx_list); - list_del (&conn->ksnc_rx_list); - } - - spin_unlock_irqrestore (&fmp->fmp_lock, flags); - - if (conn == NULL) - return; - - CDEBUG (D_NET, "Scheduling conn %p\n", conn); - LASSERT (conn->ksnc_rx_scheduled); - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); - - conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; - - sched = conn->ksnc_scheduler; - - spin_lock_irqsave (&sched->kss_lock, flags); - - list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); - wake_up (&sched->kss_waitq); - - spin_unlock_irqrestore (&sched->kss_lock, flags); -} - -ksock_fmb_t * 
-ksocknal_get_idle_fmb (ksock_conn_t *conn) -{ - int payload_nob = conn->ksnc_rx_nob_left; - unsigned long flags; - ksock_fmb_pool_t *pool; - ksock_fmb_t *fmb; - - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); - LASSERT (kpr_routing(&ksocknal_data.ksnd_router)); - - if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) - pool = &ksocknal_data.ksnd_small_fmp; - else - pool = &ksocknal_data.ksnd_large_fmp; - - spin_lock_irqsave (&pool->fmp_lock, flags); - - if (!list_empty (&pool->fmp_idle_fmbs)) { - fmb = list_entry(pool->fmp_idle_fmbs.next, - ksock_fmb_t, fmb_list); - list_del (&fmb->fmb_list); - pool->fmp_nactive_fmbs++; - spin_unlock_irqrestore (&pool->fmp_lock, flags); - - return (fmb); - } - - /* deschedule until fmb free */ - - conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; - - list_add_tail (&conn->ksnc_rx_list, - &pool->fmp_blocked_conns); - - spin_unlock_irqrestore (&pool->fmp_lock, flags); - return (NULL); -} - -int -ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) -{ - int payload_nob = conn->ksnc_rx_nob_left; - ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); - int niov = 0; - int nob = payload_nob; - - LASSERT (conn->ksnc_rx_scheduled); - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); - LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); - LASSERT (payload_nob >= 0); - LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * PAGE_SIZE); - LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); - LASSERT (fmb->fmb_kiov[0].kiov_offset == 0); - - /* Take a ref on the conn's peer to prevent module unload before - * forwarding completes. */ - fmb->fmb_peer = conn->ksnc_peer; - atomic_inc (&conn->ksnc_peer->ksnp_refcount); - - /* Copy the header we just read into the forwarding buffer. If - * there's payload, start reading reading it into the buffer, - * otherwise the forwarding buffer can be kicked off - * immediately. 
*/ - fmb->fmb_hdr = conn->ksnc_hdr; - - while (nob > 0) { - LASSERT (niov < fmb->fmb_pool->fmp_buff_pages); - LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0); - fmb->fmb_kiov[niov].kiov_len = MIN (PAGE_SIZE, nob); - nob -= PAGE_SIZE; - niov++; - } - - kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr, - payload_nob, niov, fmb->fmb_kiov, - ksocknal_fmb_callback, fmb); - - if (payload_nob == 0) { /* got complete packet already */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n", - conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid); - - kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); - - ksocknal_new_packet (conn, 0); /* on to next packet */ - return (1); - } - - conn->ksnc_cookie = fmb; /* stash fmb for later */ - conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - - /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed - * buffer */ - LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t)); - - conn->ksnc_rx_niov = 0; - conn->ksnc_rx_nkiov = niov; - conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); - - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, - le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob); - return (0); -} - -void -ksocknal_fwd_parse (ksock_conn_t *conn) -{ - ksock_peer_t *peer; - ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); - ptl_nid_t src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid); - int body_len = le32_to_cpu(conn->ksnc_hdr.payload_length); - char str[PTL_NALFMT_SIZE]; - char str2[PTL_NALFMT_SIZE]; - - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, - src_nid, dest_nid, conn->ksnc_rx_nob_left); - - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); - LASSERT (conn->ksnc_rx_scheduled); - - if (body_len < 0) { /* length corrupt (overflow) */ - CERROR("dropping packet from "LPX64" (%s) for "LPX64" (%s): " - "packet size %d illegal\n", - src_nid, 
portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2), - body_len); - - ksocknal_new_packet (conn, 0); /* on to new packet */ - return; - } - - if (!kpr_routing(&ksocknal_data.ksnd_router)) { /* not forwarding */ - CERROR("dropping packet from "LPX64" (%s) for "LPX64 - " (%s): not forwarding\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2)); - /* on to new packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - if (body_len > PTL_MTU) { /* too big to forward */ - CERROR ("dropping packet from "LPX64" (%s) for "LPX64 - "(%s): packet size %d too big\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2), - body_len); - /* on to new packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - /* should have gone direct */ - peer = ksocknal_get_peer (conn->ksnc_hdr.dest_nid); - if (peer != NULL) { - CERROR ("dropping packet from "LPX64" (%s) for "LPX64 - "(%s): target is a peer\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2)); - ksocknal_put_peer (peer); /* drop ref from get above */ - - /* on to next packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ - conn->ksnc_rx_nob_left = body_len; /* stash packet size */ - conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ -} - -int -ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) -{ - static char ksocknal_slop_buffer[4096]; - - int nob; - int niov; - int skipped; - - if (nob_to_skip == 0) { /* right at next packet boundary now */ - conn->ksnc_rx_started = 0; - mb (); /* racing with timeout thread */ - - conn->ksnc_rx_state = SOCKNAL_RX_HEADER; - conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); - conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); - - 
conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; - conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; - conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); - conn->ksnc_rx_niov = 1; - - conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_nkiov = 0; - return (1); - } - - /* Set up to skip as much a possible now. If there's more left - * (ran out of iov entries) we'll get called again */ - - conn->ksnc_rx_state = SOCKNAL_RX_SLOP; - conn->ksnc_rx_nob_left = nob_to_skip; - conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; - skipped = 0; - niov = 0; - - do { - nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); - - conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; - conn->ksnc_rx_iov[niov].iov_len = nob; - niov++; - skipped += nob; - nob_to_skip -=nob; - - } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ - niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); - - conn->ksnc_rx_niov = niov; - conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_nkiov = 0; - conn->ksnc_rx_nob_wanted = skipped; - return (0); -} - -int -ksocknal_process_receive (ksock_conn_t *conn) -{ - ksock_fmb_t *fmb; - int rc; - - LASSERT (atomic_read (&conn->ksnc_refcount) > 0); - - /* doesn't need a forwarding buffer */ - if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) - goto try_read; - - get_fmb: - fmb = ksocknal_get_idle_fmb (conn); - if (fmb == NULL) { - /* conn descheduled waiting for idle fmb */ - return (0); - } - - if (ksocknal_init_fmb (conn, fmb)) { - /* packet forwarded */ - return (0); - } - - try_read: - /* NB: sched lock NOT held */ - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_BODY || - conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); - - LASSERT (conn->ksnc_rx_nob_wanted > 0); - - rc = ksocknal_receive(conn); - - if (rc <= 0) { - LASSERT (rc != -EAGAIN); - - if (rc == 0) - CWARN ("[%p] EOF from "LPX64" ip %d.%d.%d.%d:%d\n", - conn, 
conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); - else if (!conn->ksnc_closing) - CERROR ("[%p] Error %d on read from "LPX64 - " ip %d.%d.%d.%d:%d\n", - conn, rc, conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); - - ksocknal_close_conn_and_siblings (conn, rc); - return (rc == 0 ? -ESHUTDOWN : rc); - } - - if (conn->ksnc_rx_nob_wanted != 0) { - /* short read */ - return (-EAGAIN); - } - - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_HEADER: - if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) && - le64_to_cpu(conn->ksnc_hdr.dest_nid) != - ksocknal_lib.libnal_ni.ni_pid.nid) { - /* This packet isn't for me */ - ksocknal_fwd_parse (conn); - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ - return (0); /* => come back later */ - case SOCKNAL_RX_SLOP: /* skipping packet's body */ - goto try_read; /* => go read it */ - case SOCKNAL_RX_GET_FMB: /* forwarding */ - goto get_fmb; /* => go get a fwd msg buffer */ - default: - LBUG (); - } - /* Not Reached */ - } - - /* sets wanted_len, iovs etc */ - rc = lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - - if (rc != PTL_OK) { - /* I just received garbage: give up on this conn */ - ksocknal_close_conn_and_siblings (conn, rc); - return (-EPROTO); - } - - if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ - conn->ksnc_rx_state = SOCKNAL_RX_BODY; - goto try_read; /* go read the payload */ - } - /* Fall through (completed packet for me) */ - - case SOCKNAL_RX_BODY: - /* payload all received */ - lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK); - /* Fall through */ - - case SOCKNAL_RX_SLOP: - /* starting new packet? 
*/ - if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) - return (0); /* come back later */ - goto try_read; /* try to finish reading slop now */ - - case SOCKNAL_RX_BODY_FWD: - /* payload all received */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", - conn, le64_to_cpu(conn->ksnc_hdr.src_nid), - le64_to_cpu(conn->ksnc_hdr.dest_nid), - conn->ksnc_rx_nob_left); - - /* forward the packet. NB ksocknal_init_fmb() put fmb into - * conn->ksnc_cookie */ - fmb = (ksock_fmb_t *)conn->ksnc_cookie; - kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); - - /* no slop in forwarded packets */ - LASSERT (conn->ksnc_rx_nob_left == 0); - - ksocknal_new_packet (conn, 0); /* on to next packet */ - return (0); /* (later) */ - - default: - break; - } - - /* Not Reached */ - LBUG (); - return (-EINVAL); /* keep gcc happy */ -} - -ptl_err_t -ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - ksock_conn_t *conn = (ksock_conn_t *)private; - - LASSERT (mlen <= rlen); - LASSERT (niov <= PTL_MD_MAX_IOV); - - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; - - conn->ksnc_rx_nkiov = 0; - conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; - conn->ksnc_rx_niov = - lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov, - niov, iov, offset, mlen); - - LASSERT (mlen == - lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + - lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - - return (PTL_OK); -} - -ptl_err_t -ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - ksock_conn_t *conn = (ksock_conn_t *)private; - - LASSERT (mlen <= rlen); - LASSERT (niov <= PTL_MD_MAX_IOV); - - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; - - conn->ksnc_rx_niov = 0; - 
conn->ksnc_rx_iov = NULL; - conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - conn->ksnc_rx_nkiov = - lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov, - niov, kiov, offset, mlen); - - LASSERT (mlen == - lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + - lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - - return (PTL_OK); -} - -static inline int -ksocknal_sched_cansleep(ksock_sched_t *sched) -{ - unsigned long flags; - int rc; - - spin_lock_irqsave(&sched->kss_lock, flags); - - rc = (!ksocknal_data.ksnd_shuttingdown && -#if SOCKNAL_ZC - list_empty(&sched->kss_zctxdone_list) && -#endif - list_empty(&sched->kss_rx_conns) && - list_empty(&sched->kss_tx_conns)); - - spin_unlock_irqrestore(&sched->kss_lock, flags); - return (rc); -} - -int ksocknal_scheduler (void *arg) -{ - ksock_sched_t *sched = (ksock_sched_t *)arg; - ksock_conn_t *conn; - ksock_tx_t *tx; - unsigned long flags; - int rc; - int nloops = 0; - int id = sched - ksocknal_data.ksnd_schedulers; - char name[16]; - - snprintf (name, sizeof (name),"ksocknald_%02d", id); - kportal_daemonize (name); - kportal_blockallsigs (); - -#if (CONFIG_SMP && CPU_AFFINITY) - id = ksocknal_sched2cpu(id); - if (cpu_online(id)) { - cpumask_t m; - cpu_set(id, m); - set_cpus_allowed(current, m); - } else { - CERROR ("Can't set CPU affinity for %s to %d\n", name, id); - } -#endif /* CONFIG_SMP && CPU_AFFINITY */ - - spin_lock_irqsave (&sched->kss_lock, flags); - - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; - - /* Ensure I progress everything semi-fairly */ - - if (!list_empty (&sched->kss_rx_conns)) { - conn = list_entry(sched->kss_rx_conns.next, - ksock_conn_t, ksnc_rx_list); - list_del(&conn->ksnc_rx_list); - - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); - - /* clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. 
*/ - conn->ksnc_rx_ready = 0; - spin_unlock_irqrestore(&sched->kss_lock, flags); - - rc = ksocknal_process_receive(conn); - - spin_lock_irqsave(&sched->kss_lock, flags); - - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); - - /* Did process_receive get everything it wanted? */ - if (rc == 0) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP || - conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB) { - /* Conn blocked for a forwarding buffer. - * It will get queued for my attention when - * one becomes available (and it might just - * already have been!). Meanwhile my ref - * on it stays put. */ - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ - list_add_tail (&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_put_conn(conn); - } - - did_something = 1; - } - - if (!list_empty (&sched->kss_tx_conns)) { - conn = list_entry(sched->kss_tx_conns.next, - ksock_conn_t, ksnc_tx_list); - list_del (&conn->ksnc_tx_list); - - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); - LASSERT(!list_empty(&conn->ksnc_tx_queue)); - - tx = list_entry(conn->ksnc_tx_queue.next, - ksock_tx_t, tx_list); - /* dequeue now so empty list => more to send */ - list_del(&tx->tx_list); - - /* Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. 
*/ - conn->ksnc_tx_ready = 0; - spin_unlock_irqrestore (&sched->kss_lock, flags); - - rc = ksocknal_process_transmit(conn, tx); - - spin_lock_irqsave (&sched->kss_lock, flags); - - if (rc == -ENOMEM || rc == -EAGAIN) { - /* Incomplete send: replace tx on HEAD of tx_queue */ - list_add (&tx->tx_list, &conn->ksnc_tx_queue); - } else { - /* Complete send; assume space for more */ - conn->ksnc_tx_ready = 1; - } - - if (rc == -ENOMEM) { - /* Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. */ - } else if (conn->ksnc_tx_ready && - !list_empty (&conn->ksnc_tx_queue)) { - /* reschedule for tx */ - list_add_tail (&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_put_conn (conn); - } - - did_something = 1; - } -#if SOCKNAL_ZC - if (!list_empty (&sched->kss_zctxdone_list)) { - ksock_tx_t *tx = - list_entry(sched->kss_zctxdone_list.next, - ksock_tx_t, tx_list); - did_something = 1; - - list_del (&tx->tx_list); - spin_unlock_irqrestore (&sched->kss_lock, flags); - - ksocknal_tx_done (tx, 1); - - spin_lock_irqsave (&sched->kss_lock, flags); - } -#endif - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ - spin_unlock_irqrestore (&sched->kss_lock, flags); - - nloops = 0; - - if (!did_something) { /* wait for something to do */ - rc = wait_event_interruptible (sched->kss_waitq, - !ksocknal_sched_cansleep(sched)); - LASSERT (rc == 0); - } else - our_cond_resched(); - - spin_lock_irqsave (&sched->kss_lock, flags); - } - } - - spin_unlock_irqrestore (&sched->kss_lock, flags); - ksocknal_thread_fini (); - return (0); -} - -void -ksocknal_data_ready (struct sock *sk, int n) -{ - unsigned long flags; - ksock_conn_t *conn; - ksock_sched_t *sched; - ENTRY; - - /* interleave correctly with closing sockets... 
*/ - read_lock (&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - if (conn == NULL) { /* raced with ksocknal_terminate_conn */ - LASSERT (sk->sk_data_ready != &ksocknal_data_ready); - sk->sk_data_ready (sk, n); - } else { - sched = conn->ksnc_scheduler; - - spin_lock_irqsave (&sched->kss_lock, flags); - - conn->ksnc_rx_ready = 1; - - if (!conn->ksnc_rx_scheduled) { /* not being progressed */ - list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - conn->ksnc_rx_scheduled = 1; - /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); - - wake_up (&sched->kss_waitq); - } - - spin_unlock_irqrestore (&sched->kss_lock, flags); - } - - read_unlock (&ksocknal_data.ksnd_global_lock); - - EXIT; -} - -void -ksocknal_write_space (struct sock *sk) -{ - unsigned long flags; - ksock_conn_t *conn; - ksock_sched_t *sched; - - /* interleave correctly with closing sockets... */ - read_lock (&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - - CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", - sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, - (conn == NULL) ? "" : (conn->ksnc_tx_ready ? - " ready" : " blocked"), - (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? - " scheduled" : " idle"), - (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? 
- " empty" : " queued")); - - if (conn == NULL) { /* raced with ksocknal_terminate_conn */ - LASSERT (sk->sk_write_space != &ksocknal_write_space); - sk->sk_write_space (sk); - - read_unlock (&ksocknal_data.ksnd_global_lock); - return; - } - - if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ - sched = conn->ksnc_scheduler; - - spin_lock_irqsave (&sched->kss_lock, flags); - - clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); - conn->ksnc_tx_ready = 1; - - if (!conn->ksnc_tx_scheduled && // not being progressed - !list_empty(&conn->ksnc_tx_queue)){//packets to send - list_add_tail (&conn->ksnc_tx_list, - &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); - - wake_up (&sched->kss_waitq); - } - - spin_unlock_irqrestore (&sched->kss_lock, flags); - } - - read_unlock (&ksocknal_data.ksnd_global_lock); -} - -int -ksocknal_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - set_fs (KERNEL_DS); - rc = sock_sendmsg (sock, &msg, iov.iov_len); - set_fs (oldmm); - - if (rc < 0) - return (rc); - - if (rc == 0) { - CERROR ("Unexpected zero rc\n"); - return (-ECONNABORTED); - } - - buffer = ((char *)buffer) + rc; - nob -= rc; - } - - return (0); -} - -int -ksocknal_sock_read (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - set_fs (KERNEL_DS); - rc = sock_recvmsg (sock, &msg, 
iov.iov_len, 0); - set_fs (oldmm); - - if (rc < 0) - return (rc); - - if (rc == 0) - return (-ECONNABORTED); - - buffer = ((char *)buffer) + rc; - nob -= rc; - } - - return (0); -} - -int -ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs) -{ - /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - struct socket *sock = conn->ksnc_sock; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; - int i; - int rc; - - LASSERT (conn->ksnc_type != SOCKNAL_CONN_NONE); - LASSERT (nipaddrs <= SOCKNAL_MAX_INTERFACES); - - /* No need for getconnsock/putconnsock */ - LASSERT (!conn->ksnc_closing); - - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - hmv->magic = cpu_to_le32 (PORTALS_PROTO_MAGIC); - hmv->version_major = cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); - - hdr.src_nid = cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid); - hdr.type = cpu_to_le32 (PTL_MSG_HELLO); - hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs)); - - hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type); - hdr.msg.hello.incarnation = - cpu_to_le64 (ksocknal_data.ksnd_incarnation); - - /* Receiver is eager */ - rc = ksocknal_sock_write (sock, &hdr, sizeof(hdr)); - if (rc != 0) { - CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", - rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - return (rc); - } - - if (nipaddrs == 0) - return (0); - - for (i = 0; i < nipaddrs; i++) { - ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]); - } - - rc = ksocknal_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs)); - if (rc != 0) - CERROR ("Error %d sending HELLO payload (%d)" - " to %u.%u.%u.%u/%d\n", rc, nipaddrs, - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - return (rc); -} - -int -ksocknal_invert_type(int type) -{ - switch (type) - { - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - return (type); - case SOCKNAL_CONN_BULK_IN: - return SOCKNAL_CONN_BULK_OUT; - case SOCKNAL_CONN_BULK_OUT: - 
return SOCKNAL_CONN_BULK_IN; - default: - return (SOCKNAL_CONN_NONE); - } -} - -int -ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, - __u64 *incarnation, __u32 *ipaddrs) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - int nips; - int i; - int type; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv; - - hmv = (ptl_magicversion_t *)&hdr.dest_nid; - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - - rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv)); - if (rc != 0) { - CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", - rc, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); - } - - if (hmv->magic != le32_to_cpu (PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n", - __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - - if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { - CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from %u.%u.%u.%u\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - PORTALS_PROTO_VERSION_MAJOR, - PORTALS_PROTO_VERSION_MINOR, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - -#if (PORTALS_PROTO_VERSION_MAJOR != 1) -# error "This code only understands protocol version 1.x" -#endif - /* version 1 sends magic/version as the dest_nid of a 'hello' - * header, followed by payload full of interface IP addresses. - * Read the rest of it in now... 
*/ - - rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); - if (rc != 0) { - CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n", - rc, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); - } - - /* ...and check we got what we expected */ - if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { - CERROR ("Expecting a HELLO hdr," - " but got type %d from %u.%u.%u.%u\n", - le32_to_cpu (hdr.type), - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - - if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY" - "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - - if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = le64_to_cpu(hdr.src_nid); - } else if (*nid != le64_to_cpu (hdr.src_nid)) { - CERROR ("Connected to nid "LPX64"@%u.%u.%u.%u " - "but expecting "LPX64"\n", - le64_to_cpu (hdr.src_nid), - HIPQUAD(conn->ksnc_ipaddr), *nid); - return (-EPROTO); - } - - type = __le32_to_cpu(hdr.msg.hello.type); - - if (conn->ksnc_type == SOCKNAL_CONN_NONE) { - /* I've accepted this connection; peer determines type */ - conn->ksnc_type = ksocknal_invert_type(type); - if (conn->ksnc_type == SOCKNAL_CONN_NONE) { - CERROR ("Unexpected type %d from "LPX64"@%u.%u.%u.%u\n", - type, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - } else if (ksocknal_invert_type(type) != conn->ksnc_type) { - CERROR ("Mismatched types: me %d, "LPX64"@%u.%u.%u.%u %d\n", - conn->ksnc_type, *nid, HIPQUAD(conn->ksnc_ipaddr), - le32_to_cpu(hdr.msg.hello.type)); - return (-EPROTO); - } - - *incarnation = le64_to_cpu(hdr.msg.hello.incarnation); - - nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32); - - if (nips > SOCKNAL_MAX_INTERFACES || - nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) { - CERROR("Bad payload length %d from "LPX64"@%u.%u.%u.%u\n", - __le32_to_cpu (hdr.payload_length), - *nid, HIPQUAD(conn->ksnc_ipaddr)); - } - - if (nips == 0) - return (0); - - rc 
= ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs)); - if (rc != 0) { - CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n", - rc, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); - } - - for (i = 0; i < nips; i++) { - ipaddrs[i] = __le32_to_cpu(ipaddrs[i]); - - if (ipaddrs[i] == 0) { - CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n", - i, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } - } - - return (nips); -} - -int -ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) -{ - mm_segment_t oldmm = get_fs (); - struct socket *sock = conn->ksnc_sock; - int len; - int rc; - - rc = ksocknal_getconnsock (conn); - if (rc != 0) { - LASSERT (conn->ksnc_closing); - *txmem = *rxmem = *nagle = 0; - return (-ESHUTDOWN); - } - - set_fs (KERNEL_DS); - - len = sizeof(*txmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)txmem, &len); - if (rc == 0) { - len = sizeof(*rxmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)rxmem, &len); - } - if (rc == 0) { - len = sizeof(*nagle); - rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)nagle, &len); - } - - set_fs (oldmm); - ksocknal_putconnsock (conn); - - if (rc == 0) - *nagle = !*nagle; - else - *txmem = *rxmem = *nagle = 0; - - return (rc); -} - -int -ksocknal_setup_sock (struct socket *sock) -{ - mm_segment_t oldmm = get_fs (); - int rc; - int option; - int keep_idle; - int keep_intvl; - int keep_count; - int do_keepalive; - struct linger linger; - - sock->sk->sk_allocation = GFP_NOFS; - - /* Ensure this socket aborts active sends immediately when we close - * it. 
*/ - - linger.l_onoff = 0; - linger.l_linger = 0; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER, - (char *)&linger, sizeof (linger)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set SO_LINGER: %d\n", rc); - return (rc); - } - - option = -1; - set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set SO_LINGER2: %d\n", rc); - return (rc); - } - - if (!ksocknal_tunables.ksnd_nagle) { - option = 1; - - set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't disable nagle: %d\n", rc); - return (rc); - } - } - - if (ksocknal_tunables.ksnd_buffer_size > 0) { - option = ksocknal_tunables.ksnd_buffer_size; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", - option, rc); - return (rc); - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", - option, rc); - return (rc); - } - } - - /* snapshot tunables */ - keep_idle = ksocknal_tunables.ksnd_keepalive_idle; - keep_count = ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; - - do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); - - option = (do_keepalive ? 
1 : 0); - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); - return (rc); - } - - if (!do_keepalive) - return (0); - - set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keep_idle, sizeof (keep_idle)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); - return (rc); - } - - set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keep_intvl, sizeof (keep_intvl)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); - return (rc); - } - - set_fs (KERNEL_DS); - rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keep_count, sizeof (keep_count)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); - return (rc); - } - - return (0); -} - -static int -ksocknal_connect_sock(struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = - (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) - : INADDR_ANY; - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - *may_retry = 0; - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc != 0) { - CERROR ("Can't create autoconnect socket: %d\n", rc); - return (rc); - } - - /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. 
And we actually need the sock->file refcounting - * that this gives you :) */ - - rc = sock_map_fd (sock); - if (rc < 0) { - sock_release (sock); - CERROR ("sock_map_fd error %d\n", rc); - return (rc); - } - - /* NB the file descriptor (rc) now owns the ref on sock->file */ - LASSERT (sock->file != NULL); - LASSERT (file_count(sock->file) == 1); - - get_file(sock->file); /* extra ref makes sock->file */ - sys_close(rc); /* survive this close */ - - /* Still got a single ref on sock->file */ - LASSERT (file_count(sock->file) == 1); - - /* Set the socket timeouts, so our connection attempt completes in - * finite time */ - tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; - tv.tv_usec = 0; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - option = 1; - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *may_retry = 1; - goto failed; - } - if (rc != 0) { - CERROR("Error trying to bind to reserved port %d: %d\n", - local_port, rc); - goto failed; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - sock->file->f_flags); - if (rc == 0) - return 0; - - /* EADDRNOTAVAIL probably means we're already connected to the same - * peer/port on the same local port on a 
differently typed - * connection. Let our caller retry with a different local - * port... */ - *may_retry = (rc == -EADDRNOTAVAIL); - - CDEBUG(*may_retry ? D_NET : D_ERROR, - "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, - HIPQUAD(route->ksnr_myipaddr), local_port, - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - - failed: - fput(sock->file); - return rc; -} - -int -ksocknal_connect_peer (ksock_route_t *route, int type) -{ - struct socket *sock; - int rc; - int port; - int may_retry; - - /* Iterate through reserved ports. When typed connections are - * used, we will need to bind to multiple ports, but we only know - * this at connect time. But, by that time we've already called - * bind() so we need a new socket. */ - - for (port = 1023; port > 512; --port) { - - rc = ksocknal_connect_sock(&sock, &may_retry, route, port); - - if (rc == 0) { - rc = ksocknal_create_conn(route, sock, type); - fput(sock->file); - return rc; - } - - if (!may_retry) - return rc; - } - - CERROR("Out of ports trying to bind to a reserved port\n"); - return (-EADDRINUSE); -} - -void -ksocknal_autoconnect (ksock_route_t *route) -{ - LIST_HEAD (zombies); - ksock_tx_t *tx; - ksock_peer_t *peer; - unsigned long flags; - int rc; - int type; - - for (;;) { - for (type = 0; type < SOCKNAL_CONN_NTYPES; type++) - if ((route->ksnr_connecting & (1 << type)) != 0) - break; - LASSERT (type < SOCKNAL_CONN_NTYPES); - - rc = ksocknal_connect_peer (route, type); - if (rc != 0) - break; - - /* successfully autoconnected: create_conn did the - * route/conn binding and scheduled any blocked packets */ - - if (route->ksnr_connecting == 0) { - /* No more connections required */ - return; - } - } - - /* Connection attempt failed */ - - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); - - peer = route->ksnr_peer; - route->ksnr_connecting = 0; - - /* This is a retry rather than a new connection */ - LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = jiffies + 
route->ksnr_retry_interval; - route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, - SOCKNAL_MAX_RECONNECT_INTERVAL); - - if (!list_empty (&peer->ksnp_tx_queue) && - ksocknal_find_connecting_route_locked (peer) == NULL) { - LASSERT (list_empty (&peer->ksnp_conns)); - - /* None of the connections that the blocked packets are - * waiting for have been successful. Complete them now... */ - do { - tx = list_entry (peer->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } while (!list_empty (&peer->ksnp_tx_queue)); - } - -#if 0 /* irrelevent with only eager routes */ - if (!route->ksnr_deleted) { - /* make this route least-favourite for re-selection */ - list_del(&route->ksnr_list); - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - } -#endif - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - - while (!list_empty (&zombies)) { - char ipbuf[PTL_NALFMT_SIZE]; - char ipbuf2[PTL_NALFMT_SIZE]; - tx = list_entry (zombies.next, ksock_tx_t, tx_list); - - CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n", - le32_to_cpu (tx->tx_hdr->type), - le32_to_cpu (tx->tx_hdr->payload_length), - le64_to_cpu (tx->tx_hdr->src_nid), - portals_nid2str(SOCKNAL, - le64_to_cpu(tx->tx_hdr->src_nid), - ipbuf), - le64_to_cpu (tx->tx_hdr->dest_nid), - portals_nid2str(SOCKNAL, - le64_to_cpu(tx->tx_hdr->src_nid), - ipbuf2)); - - list_del (&tx->tx_list); - /* complete now */ - ksocknal_tx_done (tx, 0); - } -} - -int -ksocknal_autoconnectd (void *arg) -{ - long id = (long)arg; - char name[16]; - unsigned long flags; - ksock_route_t *route; - int rc; - - snprintf (name, sizeof (name), "ksocknal_ad%02ld", id); - kportal_daemonize (name); - kportal_blockallsigs (); - - spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); - - while (!ksocknal_data.ksnd_shuttingdown) { - - if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) { - route = list_entry 
(ksocknal_data.ksnd_autoconnectd_routes.next, - ksock_route_t, ksnr_connect_list); - - list_del (&route->ksnr_connect_list); - spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); - - ksocknal_autoconnect (route); - ksocknal_put_route (route); - - spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, - flags); - continue; - } - - spin_unlock_irqrestore(&ksocknal_data.ksnd_autoconnectd_lock, - flags); - - rc = wait_event_interruptible(ksocknal_data.ksnd_autoconnectd_waitq, - ksocknal_data.ksnd_shuttingdown || - !list_empty(&ksocknal_data.ksnd_autoconnectd_routes)); - - spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, flags); - } - - spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); - - ksocknal_thread_fini (); - return (0); -} - -ksock_conn_t * -ksocknal_find_timed_out_conn (ksock_peer_t *peer) -{ - /* We're called with a shared lock on ksnd_global_lock */ - ksock_conn_t *conn; - struct list_head *ctmp; - - list_for_each (ctmp, &peer->ksnp_conns) { - conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - - /* Don't need the {get,put}connsock dance to deref ksnc_sock... */ - LASSERT (!conn->ksnc_closing); - - if (conn->ksnc_sock->sk->sk_err != 0) { - /* Something (e.g. 
failed keepalive) set the socket error */ - atomic_inc (&conn->ksnc_refcount); - CERROR ("Socket error %d: "LPX64" %p %d.%d.%d.%d\n", - conn->ksnc_sock->sk->sk_err, peer->ksnp_nid, - conn, HIPQUAD(conn->ksnc_ipaddr)); - return (conn); - } - - if (conn->ksnc_rx_started && - time_after_eq (jiffies, conn->ksnc_rx_deadline)) { - /* Timed out incomplete incoming message */ - atomic_inc (&conn->ksnc_refcount); - CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n", - peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr)); - return (conn); - } - - if ((!list_empty (&conn->ksnc_tx_queue) || - conn->ksnc_sock->sk->sk_wmem_queued != 0) && - time_after_eq (jiffies, conn->ksnc_tx_deadline)) { - /* Timed out messages queued for sending or - * buffered in the socket's send buffer */ - atomic_inc (&conn->ksnc_refcount); - CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n", - peer->ksnp_nid, - list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", - conn->ksnc_sock->sk->sk_wmem_queued, conn, - HIPQUAD(conn->ksnc_ipaddr)); - return (conn); - } - } - - return (NULL); -} - -void -ksocknal_check_peer_timeouts (int idx) -{ - struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; - struct list_head *ptmp; - ksock_peer_t *peer; - ksock_conn_t *conn; - - again: - /* NB. We expect to have a look at all the peers and not find any - * connections to time out, so we just use a shared lock while we - * take a look... 
*/ - read_lock (&ksocknal_data.ksnd_global_lock); - - list_for_each (ptmp, peers) { - peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - conn = ksocknal_find_timed_out_conn (peer); - - if (conn != NULL) { - read_unlock (&ksocknal_data.ksnd_global_lock); - - CERROR ("Timeout out conn->"LPX64" ip %d.%d.%d.%d:%d\n", - peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); - ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); - - /* NB we won't find this one again, but we can't - * just proceed with the next peer, since we dropped - * ksnd_global_lock and it might be dead already! */ - ksocknal_put_conn (conn); - goto again; - } - } - - read_unlock (&ksocknal_data.ksnd_global_lock); -} - -int -ksocknal_reaper (void *arg) -{ - wait_queue_t wait; - unsigned long flags; - ksock_conn_t *conn; - ksock_sched_t *sched; - struct list_head enomem_conns; - int nenomem_conns; - int timeout; - int i; - int peer_index = 0; - unsigned long deadline = jiffies; - - kportal_daemonize ("ksocknal_reaper"); - kportal_blockallsigs (); - - INIT_LIST_HEAD(&enomem_conns); - init_waitqueue_entry (&wait, current); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - - while (!ksocknal_data.ksnd_shuttingdown) { - - if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { - conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next, - ksock_conn_t, ksnc_list); - list_del (&conn->ksnc_list); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - - ksocknal_terminate_conn (conn); - ksocknal_put_conn (conn); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - continue; - } - - if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry (ksocknal_data.ksnd_zombie_conns.next, - ksock_conn_t, ksnc_list); - list_del (&conn->ksnc_list); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - - ksocknal_destroy_conn (conn); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - continue; - } - - if (!list_empty 
(&ksocknal_data.ksnd_enomem_conns)) { - list_add(&enomem_conns, &ksocknal_data.ksnd_enomem_conns); - list_del_init(&ksocknal_data.ksnd_enomem_conns); - } - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - - /* reschedule all the connections that stalled with ENOMEM... */ - nenomem_conns = 0; - while (!list_empty (&enomem_conns)) { - conn = list_entry (enomem_conns.next, - ksock_conn_t, ksnc_tx_list); - list_del (&conn->ksnc_tx_list); - - sched = conn->ksnc_scheduler; - - spin_lock_irqsave (&sched->kss_lock, flags); - - LASSERT (conn->ksnc_tx_scheduled); - conn->ksnc_tx_ready = 1; - list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); - wake_up (&sched->kss_waitq); - - spin_unlock_irqrestore (&sched->kss_lock, flags); - nenomem_conns++; - } - - /* careful with the jiffy wrap... */ - while ((timeout = (int)(deadline - jiffies)) <= 0) { - const int n = 4; - const int p = 1; - int chunk = ksocknal_data.ksnd_peer_hash_size; - - /* Time to check for timeouts on a few more peers: I do - * checks every 'p' seconds on a proportion of the peer - * table and I need to check every connection 'n' times - * within a timeout interval, to ensure I detect a - * timeout on any connection within (n+1)/n times the - * timeout interval. */ - - if (ksocknal_tunables.ksnd_io_timeout > n * p) - chunk = (chunk * n * p) / - ksocknal_tunables.ksnd_io_timeout; - if (chunk == 0) - chunk = 1; - - for (i = 0; i < chunk; i++) { - ksocknal_check_peer_timeouts (peer_index); - peer_index = (peer_index + 1) % - ksocknal_data.ksnd_peer_hash_size; - } - - deadline += p * HZ; - } - - if (nenomem_conns != 0) { - /* Reduce my timeout if I rescheduled ENOMEM conns. - * This also prevents me getting woken immediately - * if any go back on my enomem list. 
*/ - timeout = SOCKNAL_ENOMEM_RETRY; - } - ksocknal_data.ksnd_reaper_waketime = jiffies + timeout; - - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); - - if (!ksocknal_data.ksnd_shuttingdown && - list_empty (&ksocknal_data.ksnd_deathrow_conns) && - list_empty (&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout (timeout); - - set_current_state (TASK_RUNNING); - remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - } - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - - ksocknal_thread_fini (); - return (0); -} - -lib_nal_t ksocknal_lib = { - libnal_data: &ksocknal_data, /* NAL private data */ - libnal_send: ksocknal_send, - libnal_send_pages: ksocknal_send_pages, - libnal_recv: ksocknal_recv, - libnal_recv_pages: ksocknal_recv_pages, - libnal_dist: ksocknal_dist -}; diff --git a/lustre/portals/libcfs/.cvsignore b/lustre/portals/libcfs/.cvsignore deleted file mode 100644 index c6f0aa4..0000000 --- a/lustre/portals/libcfs/.cvsignore +++ /dev/null @@ -1,11 +0,0 @@ -.deps -Makefile -link-stamp -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/libcfs/Makefile.in b/lustre/portals/libcfs/Makefile.in deleted file mode 100644 index 15fff12..0000000 --- a/lustre/portals/libcfs/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES = libcfs -libcfs-objs := debug.o lwt.o module.o proc.o tracefile.o watchdog.o - -@INCLUDE_RULES@ diff --git a/lustre/portals/libcfs/Makefile.mk b/lustre/portals/libcfs/Makefile.mk deleted file mode 100644 index 8ecf3c9..0000000 --- a/lustre/portals/libcfs/Makefile.mk +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -include fs/lustre/portals/Kernelenv - -obj-y += libcfs.o -libcfs-objs := module.o proc.o debug.o lwt.o tracefile.o diff --git a/lustre/portals/libcfs/autoMakefile.am b/lustre/portals/libcfs/autoMakefile.am deleted file mode 100644 index 9c27693..0000000 --- a/lustre/portals/libcfs/autoMakefile.am +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (C) 2001, 2002 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -modulenet_DATA := libcfs$(KMODEXT) -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(libcfs-objs:%.o=%.c) tracefile.h diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c deleted file mode 100644 index b5286fc..0000000 --- a/lustre/portals/libcfs/debug.c +++ /dev/null @@ -1,336 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -# define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include - -#include "tracefile.h" - -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -#include -#endif - -unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_NAL); -EXPORT_SYMBOL(portal_subsystem_debug); - -unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA | - D_RPCTRACE | D_VFSTRACE); -EXPORT_SYMBOL(portal_debug); - -unsigned int portal_printk; -EXPORT_SYMBOL(portal_printk); - -unsigned int portal_stack; -EXPORT_SYMBOL(portal_stack); - -#ifdef __KERNEL__ -atomic_t portal_kmemory = ATOMIC_INIT(0); -EXPORT_SYMBOL(portal_kmemory); -#endif - -static DECLARE_WAIT_QUEUE_HEAD(debug_ctlwq); - -char debug_file_path[1024] = "/tmp/lustre-log"; -static char debug_file_name[1024]; -static int handled_panic; /* to avoid recursive calls to notifiers */ -char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall"; - -void portals_debug_dumplog_internal(void *arg) -{ - void *journal_info = current->journal_info; - current->journal_info = NULL; - - snprintf(debug_file_name, sizeof(debug_file_path) - 1, - "%s.%ld.%ld", debug_file_path, CURRENT_SECONDS, (long)arg); - printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); - tracefile_dump_all_pages(debug_file_name); - - current->journal_info = journal_info; -} - -int portals_debug_dumplog_thread(void *arg) -{ - kportal_daemonize(""); - reparent_to_init(); - portals_debug_dumplog_internal(arg); - wake_up(&debug_ctlwq); - return 0; -} - -void portals_debug_dumplog(void) -{ - int rc; - DECLARE_WAITQUEUE(wait, current); - ENTRY; - - /* we're being careful to ensure that the kernel thread is - * able to set our state to running as it exits before we - * 
get to schedule() */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&debug_ctlwq, &wait); - - rc = kernel_thread(portals_debug_dumplog_thread, - (void *)(long)current->pid, - CLONE_VM | CLONE_FS | CLONE_FILES); - if (rc < 0) - printk(KERN_ERR "LustreError: cannot start log dump thread: " - "%d\n", rc); - else - schedule(); - - /* be sure to teardown if kernel_thread() failed */ - remove_wait_queue(&debug_ctlwq, &wait); - set_current_state(TASK_RUNNING); -} - -static int panic_dumplog(struct notifier_block *self, unsigned long unused1, - void *unused2) -{ - if (handled_panic) - return 0; - else - handled_panic = 1; - - if (in_interrupt()) { - trace_debug_print(); - return 0; - } - - while (current->lock_depth >= 0) - unlock_kernel(); - portals_debug_dumplog(); - return 0; -} - -static struct notifier_block lustre_panic_notifier = { - notifier_call : panic_dumplog, - next : NULL, - priority : 10000 -}; - -int portals_debug_init(unsigned long bufsize) -{ - notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier); - return tracefile_init(); -} - -int portals_debug_cleanup(void) -{ - tracefile_exit(); - notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); - return 0; -} - -int portals_debug_clear_buffer(void) -{ - trace_flush_pages(); - return 0; -} - -/* Debug markers, although printed by S_PORTALS - * should not be be marked as such. 
*/ -#undef DEBUG_SUBSYSTEM -#define DEBUG_SUBSYSTEM S_UNDEFINED -int portals_debug_mark_buffer(char *text) -{ - CDEBUG(D_TRACE,"***************************************************\n"); - CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text); - CDEBUG(D_TRACE,"***************************************************\n"); - - return 0; -} -#undef DEBUG_SUBSYSTEM -#define DEBUG_SUBSYSTEM S_PORTALS - -void portals_debug_set_level(unsigned int debug_level) -{ - printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n", - debug_level); - portal_debug = debug_level; -} - -void portals_run_upcall(char **argv) -{ - int rc; - int argc; - char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; - ENTRY; - - argv[0] = portals_upcall; - argc = 1; - while (argv[argc] != NULL) - argc++; - - LASSERT(argc >= 2); - - rc = USERMODEHELPER(argv[0], argv, envp); - if (rc < 0) { - CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; " - "check /proc/sys/portals/upcall\n", - rc, argv[0], argv[1], - argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], - argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], - argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], - argc < 6 ? "" : ",..."); - } else { - CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n", - argv[0], argv[1], - argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], - argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], - argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], - argc < 6 ? 
"" : ",..."); - } -} - -void portals_run_lbug_upcall(char *file, const char *fn, const int line) -{ - char *argv[6]; - char buf[32]; - - ENTRY; - snprintf (buf, sizeof buf, "%d", line); - - argv[1] = "LBUG"; - argv[2] = file; - argv[3] = (char *)fn; - argv[4] = buf; - argv[5] = NULL; - - portals_run_upcall (argv); -} - -char *portals_nid2str(int nal, ptl_nid_t nid, char *str) -{ - if (nid == PTL_NID_ANY) { - snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY"); - return str; - } - - switch(nal){ -/* XXX this could be a nal method of some sort, 'cept it's config - * dependent whether (say) socknal NIDs are actually IP addresses... */ -#if !CRAY_PORTALS - case TCPNAL: - /* userspace NAL */ - case IIBNAL: - case OPENIBNAL: - case RANAL: - case SOCKNAL: - snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", - (__u32)(nid >> 32), HIPQUAD(nid)); - break; - case QSWNAL: - case GMNAL: - case LONAL: - snprintf(str, PTL_NALFMT_SIZE, "%u:%u", - (__u32)(nid >> 32), (__u32)nid); - break; -#endif - default: - snprintf(str, PTL_NALFMT_SIZE, "?%x? 
%llx", - nal, (long long)nid); - break; - } - return str; -} - -char *portals_id2str(int nal, ptl_process_id_t id, char *str) -{ - int len; - - portals_nid2str(nal, id.nid, str); - len = strlen(str); - snprintf(str + len, PTL_NALFMT_SIZE - len, "-%u", id.pid); - return str; -} - -#ifdef __KERNEL__ - -void portals_debug_dumpstack(struct task_struct *tsk) -{ -#if defined(__arch_um__) - if (tsk != NULL) - CWARN("stack dump for pid %d (%d) requested; wake up gdb.\n", - tsk->pid, UML_PID(tsk)); - asm("int $3"); -#elif defined(HAVE_SHOW_TASK) - /* this is exported by lustre kernel version 42 */ - extern void show_task(struct task_struct *); - - if (tsk == NULL) - tsk = current; - CWARN("showing stack for process %d\n", tsk->pid); - show_task(tsk); -#else - CWARN("can't show stack: kernel doesn't export show_task\n"); -#endif -} - -struct task_struct *portals_current(void) -{ - CWARN("current task struct is %p\n", current); - return current; -} - -EXPORT_SYMBOL(portals_debug_dumpstack); -EXPORT_SYMBOL(portals_current); -#endif /* __KERNEL__ */ - -EXPORT_SYMBOL(portals_debug_dumplog); -EXPORT_SYMBOL(portals_debug_set_level); -EXPORT_SYMBOL(portals_run_upcall); -EXPORT_SYMBOL(portals_run_lbug_upcall); -EXPORT_SYMBOL(portals_nid2str); -EXPORT_SYMBOL(portals_id2str); diff --git a/lustre/portals/libcfs/lwt.c b/lustre/portals/libcfs/lwt.c deleted file mode 100644 index 3f6a9c2..0000000 --- a/lustre/portals/libcfs/lwt.c +++ /dev/null @@ -1,268 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2003 Cluster File Systems, Inc. - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_PORTALS - -#include - -#if LWT_SUPPORT - -#if !KLWT_SUPPORT -int lwt_enabled; -lwt_cpu_t lwt_cpus[NR_CPUS]; -#endif - -int lwt_pages_per_cpu; - -/* NB only root is allowed to retrieve LWT info; it's an open door into the - * kernel... */ - -int -lwt_lookup_string (int *size, char *knl_ptr, - char *user_ptr, int user_size) -{ - int maxsize = 128; - - /* knl_ptr was retrieved from an LWT snapshot and the caller wants to - * turn it into a string. NB we can crash with an access violation - * trying to determine the string length, so we're trusting our - * caller... */ - - if (!capable(CAP_SYS_ADMIN)) - return (-EPERM); - - if (user_size > 0 && - maxsize > user_size) - maxsize = user_size; - - *size = strnlen (knl_ptr, maxsize - 1) + 1; - - if (user_ptr != NULL) { - if (user_size < 4) - return (-EINVAL); - - if (copy_to_user (user_ptr, knl_ptr, *size)) - return (-EFAULT); - - /* Did I truncate the string? 
*/ - if (knl_ptr[*size - 1] != 0) - copy_to_user (user_ptr + *size - 4, "...", 4); - } - - return (0); -} - -int -lwt_control (int enable, int clear) -{ - lwt_page_t *p; - int i; - int j; - - if (!capable(CAP_SYS_ADMIN)) - return (-EPERM); - - if (!enable) { - LWT_EVENT(0,0,0,0); - lwt_enabled = 0; - mb(); - /* give people some time to stop adding traces */ - schedule_timeout(10); - } - - for (i = 0; i < num_online_cpus(); i++) { - p = lwt_cpus[i].lwtc_current_page; - - if (p == NULL) - return (-ENODATA); - - if (!clear) - continue; - - for (j = 0; j < lwt_pages_per_cpu; j++) { - memset (p->lwtp_events, 0, PAGE_SIZE); - - p = list_entry (p->lwtp_list.next, - lwt_page_t, lwtp_list); - } - } - - if (enable) { - lwt_enabled = 1; - mb(); - LWT_EVENT(0,0,0,0); - } - - return (0); -} - -int -lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, - void *user_ptr, int user_size) -{ - const int events_per_page = PAGE_SIZE / sizeof(lwt_event_t); - const int bytes_per_page = events_per_page * sizeof(lwt_event_t); - lwt_page_t *p; - int i; - int j; - - if (!capable(CAP_SYS_ADMIN)) - return (-EPERM); - - *ncpu = num_online_cpus(); - *total_size = num_online_cpus() * lwt_pages_per_cpu * bytes_per_page; - *now = get_cycles(); - - if (user_ptr == NULL) - return (0); - - for (i = 0; i < num_online_cpus(); i++) { - p = lwt_cpus[i].lwtc_current_page; - - if (p == NULL) - return (-ENODATA); - - for (j = 0; j < lwt_pages_per_cpu; j++) { - if (copy_to_user(user_ptr, p->lwtp_events, - bytes_per_page)) - return (-EFAULT); - - user_ptr = ((char *)user_ptr) + bytes_per_page; - p = list_entry(p->lwtp_list.next, - lwt_page_t, lwtp_list); - - } - } - - return (0); -} - -int -lwt_init () -{ - int i; - int j; - - for (i = 0; i < num_online_cpus(); i++) - if (lwt_cpus[i].lwtc_current_page != NULL) - return (-EALREADY); - - LASSERT (!lwt_enabled); - - /* NULL pointers, zero scalars */ - memset (lwt_cpus, 0, sizeof (lwt_cpus)); - lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * PAGE_SIZE); 
- - for (i = 0; i < num_online_cpus(); i++) - for (j = 0; j < lwt_pages_per_cpu; j++) { - struct page *page = alloc_page (GFP_KERNEL); - lwt_page_t *lwtp; - - if (page == NULL) { - CERROR ("Can't allocate page\n"); - lwt_fini (); - return (-ENOMEM); - } - - PORTAL_ALLOC(lwtp, sizeof (*lwtp)); - if (lwtp == NULL) { - CERROR ("Can't allocate lwtp\n"); - __free_page(page); - lwt_fini (); - return (-ENOMEM); - } - - lwtp->lwtp_page = page; - lwtp->lwtp_events = page_address(page); - memset (lwtp->lwtp_events, 0, PAGE_SIZE); - - if (j == 0) { - INIT_LIST_HEAD (&lwtp->lwtp_list); - lwt_cpus[i].lwtc_current_page = lwtp; - } else { - list_add (&lwtp->lwtp_list, - &lwt_cpus[i].lwtc_current_page->lwtp_list); - } - } - - lwt_enabled = 1; - mb(); - - LWT_EVENT(0,0,0,0); - - return (0); -} - -void -lwt_fini () -{ - int i; - - lwt_control(0, 0); - - for (i = 0; i < num_online_cpus(); i++) - while (lwt_cpus[i].lwtc_current_page != NULL) { - lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page; - - if (list_empty (&lwtp->lwtp_list)) { - lwt_cpus[i].lwtc_current_page = NULL; - } else { - lwt_cpus[i].lwtc_current_page = - list_entry (lwtp->lwtp_list.next, - lwt_page_t, lwtp_list); - - list_del (&lwtp->lwtp_list); - } - - __free_page (lwtp->lwtp_page); - PORTAL_FREE (lwtp, sizeof (*lwtp)); - } -} - -EXPORT_SYMBOL(lwt_enabled); -EXPORT_SYMBOL(lwt_cpus); - -EXPORT_SYMBOL(lwt_init); -EXPORT_SYMBOL(lwt_fini); -EXPORT_SYMBOL(lwt_lookup_string); -EXPORT_SYMBOL(lwt_control); -EXPORT_SYMBOL(lwt_snapshot); -#endif diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c deleted file mode 100644 index 2a8e6f6..0000000 --- a/lustre/portals/libcfs/module.c +++ /dev/null @@ -1,608 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define PORTAL_MINOR 240 - -struct nal_cmd_handler { - int nch_number; - nal_cmd_handler_fn *nch_handler; - void *nch_private; -}; - -static struct nal_cmd_handler nal_cmd[16]; -static DECLARE_MUTEX(nal_cmd_sem); - -#ifdef PORTAL_DEBUG -void kportal_assertion_failed(char *expr, char *file, const char *func, - const int line) -{ - portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK, - "ASSERTION(%s) failed\n", expr); - LBUG_WITH_LOC(file, func, line); -} -#endif - -void -kportal_daemonize (char *str) -{ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) - daemonize(str); -#else - daemonize(); - snprintf (current->comm, sizeof (current->comm), "%s", str); -#endif -} - -void -kportal_memhog_free (struct portals_device_userstate *pdu) -{ - struct page **level0p = &pdu->pdu_memhog_root_page; - struct page **level1p; - struct page **level2p; - int count1; - int count2; - - if (*level0p != NULL) { - - level1p = (struct page **)page_address(*level0p); - count1 = 0; - - while (count1 < PAGE_SIZE/sizeof(struct page *) && - *level1p 
!= NULL) { - - level2p = (struct page **)page_address(*level1p); - count2 = 0; - - while (count2 < PAGE_SIZE/sizeof(struct page *) && - *level2p != NULL) { - - __free_page(*level2p); - pdu->pdu_memhog_pages--; - level2p++; - count2++; - } - - __free_page(*level1p); - pdu->pdu_memhog_pages--; - level1p++; - count1++; - } - - __free_page(*level0p); - pdu->pdu_memhog_pages--; - - *level0p = NULL; - } - - LASSERT (pdu->pdu_memhog_pages == 0); -} - -int -kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flags) -{ - struct page **level0p; - struct page **level1p; - struct page **level2p; - int count1; - int count2; - - LASSERT (pdu->pdu_memhog_pages == 0); - LASSERT (pdu->pdu_memhog_root_page == NULL); - - if (npages < 0) - return -EINVAL; - - if (npages == 0) - return 0; - - level0p = &pdu->pdu_memhog_root_page; - *level0p = alloc_page(flags); - if (*level0p == NULL) - return -ENOMEM; - pdu->pdu_memhog_pages++; - - level1p = (struct page **)page_address(*level0p); - count1 = 0; - memset(level1p, 0, PAGE_SIZE); - - while (pdu->pdu_memhog_pages < npages && - count1 < PAGE_SIZE/sizeof(struct page *)) { - - if (signal_pending(current)) - return (-EINTR); - - *level1p = alloc_page(flags); - if (*level1p == NULL) - return -ENOMEM; - pdu->pdu_memhog_pages++; - - level2p = (struct page **)page_address(*level1p); - count2 = 0; - memset(level2p, 0, PAGE_SIZE); - - while (pdu->pdu_memhog_pages < npages && - count2 < PAGE_SIZE/sizeof(struct page *)) { - - if (signal_pending(current)) - return (-EINTR); - - *level2p = alloc_page(flags); - if (*level2p == NULL) - return (-ENOMEM); - pdu->pdu_memhog_pages++; - - level2p++; - count2++; - } - - level1p++; - count1++; - } - - return 0; -} - -void -kportal_blockallsigs () -{ - unsigned long flags; - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(¤t->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); -} - -/* called when opening /dev/device */ -static int libcfs_psdev_open(struct inode * inode, 
struct file * file) -{ - struct portals_device_userstate *pdu; - ENTRY; - - if (!inode) - RETURN(-EINVAL); - - PORTAL_MODULE_USE; - - PORTAL_ALLOC(pdu, sizeof(*pdu)); - if (pdu != NULL) { - pdu->pdu_memhog_pages = 0; - pdu->pdu_memhog_root_page = NULL; - } - file->private_data = pdu; - - RETURN(0); -} - -/* called when closing /dev/device */ -static int libcfs_psdev_release(struct inode * inode, struct file * file) -{ - struct portals_device_userstate *pdu; - ENTRY; - - if (!inode) - RETURN(-EINVAL); - - pdu = file->private_data; - if (pdu != NULL) { - kportal_memhog_free(pdu); - PORTAL_FREE(pdu, sizeof(*pdu)); - } - - PORTAL_MODULE_UNUSE; - RETURN(0); -} - -static inline void freedata(void *data, int len) -{ - PORTAL_FREE(data, len); -} - -struct nal_cmd_handler * -libcfs_find_nal_cmd_handler(int nal) -{ - int i; - - for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) - if (nal_cmd[i].nch_handler != NULL && - nal_cmd[i].nch_number == nal) - return (&nal_cmd[i]); - - return (NULL); -} - -int -libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private) -{ - struct nal_cmd_handler *cmd; - int i; - int rc; - - CDEBUG(D_IOCTL, "Register NAL %x, handler: %p\n", nal, handler); - - down(&nal_cmd_sem); - - if (libcfs_find_nal_cmd_handler(nal) != NULL) { - up (&nal_cmd_sem); - return (-EBUSY); - } - - cmd = NULL; - for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) - if (nal_cmd[i].nch_handler == NULL) { - cmd = &nal_cmd[i]; - break; - } - - if (cmd == NULL) { - rc = -EBUSY; - } else { - rc = 0; - cmd->nch_number = nal; - cmd->nch_handler = handler; - cmd->nch_private = private; - } - - up(&nal_cmd_sem); - - return rc; -} -EXPORT_SYMBOL(libcfs_nal_cmd_register); - -void -libcfs_nal_cmd_unregister(int nal) -{ - struct nal_cmd_handler *cmd; - - CDEBUG(D_IOCTL, "Unregister NAL %x\n", nal); - - down(&nal_cmd_sem); - cmd = libcfs_find_nal_cmd_handler(nal); - LASSERT (cmd != NULL); - cmd->nch_handler = NULL; - cmd->nch_private = NULL; - 
up(&nal_cmd_sem); -} -EXPORT_SYMBOL(libcfs_nal_cmd_unregister); - -int -libcfs_nal_cmd(struct portals_cfg *pcfg) -{ -#if CRAY_PORTALS - /* pretend success */ - RETURN(0); -#else - struct nal_cmd_handler *cmd; - __u32 nal = pcfg->pcfg_nal; - int rc = -EINVAL; - ENTRY; - - down(&nal_cmd_sem); - cmd = libcfs_find_nal_cmd_handler(nal); - if (cmd != NULL) { - CDEBUG(D_IOCTL, "calling handler nal: %x, cmd: %d\n", nal, - pcfg->pcfg_command); - rc = cmd->nch_handler(pcfg, cmd->nch_private); - } else { - CERROR("invalid nal: %x, cmd: %d\n", nal, pcfg->pcfg_command); - } - up(&nal_cmd_sem); - - RETURN(rc); -#endif -} -EXPORT_SYMBOL(libcfs_nal_cmd); - -static DECLARE_RWSEM(ioctl_list_sem); -static LIST_HEAD(ioctl_list); - -int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - down_read(&ioctl_list_sem); - if (!list_empty(&hand->item)) - rc = -EBUSY; - up_read(&ioctl_list_sem); - - if (rc == 0) { - down_write(&ioctl_list_sem); - list_add_tail(&hand->item, &ioctl_list); - up_write(&ioctl_list_sem); - } - RETURN(0); -} -EXPORT_SYMBOL(libcfs_register_ioctl); - -int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) -{ - int rc = 0; - down_read(&ioctl_list_sem); - if (list_empty(&hand->item)) - rc = -ENOENT; - up_read(&ioctl_list_sem); - - if (rc == 0) { - down_write(&ioctl_list_sem); - list_del_init(&hand->item); - up_write(&ioctl_list_sem); - } - RETURN(0); -} -EXPORT_SYMBOL(libcfs_deregister_ioctl); - -static int libcfs_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err = -EINVAL; - char buf[1024]; - struct portal_ioctl_data *data; - ENTRY; - - if (current->fsuid != 0) - RETURN(err = -EACCES); - - if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || - _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || - _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { - CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", - _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); - RETURN(-EINVAL); - } - - if (portal_ioctl_getdata(buf, buf + 800, (void 
*)arg)) { - CERROR("PORTALS ioctl: data error\n"); - RETURN(-EINVAL); - } - - data = (struct portal_ioctl_data *)buf; - - switch (cmd) { - case IOC_PORTAL_CLEAR_DEBUG: - portals_debug_clear_buffer(); - RETURN(0); - case IOC_PORTAL_PANIC: - if (!capable (CAP_SYS_BOOT)) - RETURN (-EPERM); - panic("debugctl-invoked panic"); - RETURN(0); - case IOC_PORTAL_MARK_DEBUG: - if (data->ioc_inlbuf1 == NULL || - data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') - RETURN(-EINVAL); - portals_debug_mark_buffer(data->ioc_inlbuf1); - RETURN(0); -#if LWT_SUPPORT - case IOC_PORTAL_LWT_CONTROL: - err = lwt_control (data->ioc_flags, data->ioc_misc); - break; - - case IOC_PORTAL_LWT_SNAPSHOT: { - cycles_t now; - int ncpu; - int total_size; - - err = lwt_snapshot (&now, &ncpu, &total_size, - data->ioc_pbuf1, data->ioc_plen1); - data->ioc_nid = now; - data->ioc_count = ncpu; - data->ioc_misc = total_size; - - /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */ - data->ioc_nid2 = sizeof(lwt_event_t); - data->ioc_nid3 = offsetof(lwt_event_t, lwte_where); - - if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) - err = -EFAULT; - break; - } - - case IOC_PORTAL_LWT_LOOKUP_STRING: - err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, - data->ioc_pbuf2, data->ioc_plen2); - if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) - err = -EFAULT; - break; -#endif - case IOC_PORTAL_NAL_CMD: { - struct portals_cfg pcfg; - - if (data->ioc_plen1 != sizeof(pcfg)) { - CERROR("Bad ioc_plen1 %d (wanted %d)\n", - data->ioc_plen1, sizeof(pcfg)); - err = -EINVAL; - break; - } - - if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, - sizeof(pcfg))) { - err = -EFAULT; - break; - } - - CDEBUG (D_IOCTL, "nal command nal %x cmd %d\n", pcfg.pcfg_nal, - pcfg.pcfg_command); - err = libcfs_nal_cmd(&pcfg); - - if (err == 0 && - copy_to_user((char *)data->ioc_pbuf1, &pcfg, - sizeof (pcfg))) - err = -EFAULT; - break; - } - - case IOC_PORTAL_MEMHOG: - if (!capable 
(CAP_SYS_ADMIN)) - err = -EPERM; - else if (file->private_data == NULL) { - err = -EINVAL; - } else { - kportal_memhog_free(file->private_data); - err = kportal_memhog_alloc(file->private_data, - data->ioc_count, - data->ioc_flags); - if (err != 0) - kportal_memhog_free(file->private_data); - } - break; - - default: { - struct libcfs_ioctl_handler *hand; - err = -EINVAL; - down_read(&ioctl_list_sem); - list_for_each_entry(hand, &ioctl_list, item) { - err = hand->handle_ioctl(data, cmd, arg); - if (err != -EINVAL) - break; - } - up_read(&ioctl_list_sem); - } break; - } - - RETURN(err); -} - - -static struct file_operations libcfs_fops = { - ioctl: libcfs_ioctl, - open: libcfs_psdev_open, - release: libcfs_psdev_release -}; - - -static struct miscdevice libcfs_dev = { - PORTAL_MINOR, - "portals", - &libcfs_fops -}; - -extern int insert_proc(void); -extern void remove_proc(void); -MODULE_AUTHOR("Peter J. Braam "); -MODULE_DESCRIPTION("Portals v3.1"); -MODULE_LICENSE("GPL"); - -static int init_libcfs_module(void) -{ - int rc; - - rc = portals_debug_init(5 * 1024 * 1024); - if (rc < 0) { - printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc); - return (rc); - } - -#if LWT_SUPPORT - rc = lwt_init(); - if (rc != 0) { - CERROR("lwt_init: error %d\n", rc); - goto cleanup_debug; - } -#endif - rc = misc_register(&libcfs_dev); - if (rc) { - CERROR("misc_register: error %d\n", rc); - goto cleanup_lwt; - } - - rc = insert_proc(); - if (rc) { - CERROR("insert_proc: error %d\n", rc); - goto cleanup_deregister; - } - - CDEBUG (D_OTHER, "portals setup OK\n"); - return (0); - - cleanup_deregister: - misc_deregister(&libcfs_dev); - cleanup_lwt: -#if LWT_SUPPORT - lwt_fini(); - cleanup_debug: -#endif - portals_debug_cleanup(); - return rc; -} - -static void exit_libcfs_module(void) -{ - int rc; - - remove_proc(); - - CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); - - rc = misc_deregister(&libcfs_dev); - if (rc) - CERROR("misc_deregister 
error %d\n", rc); - -#if LWT_SUPPORT - lwt_fini(); -#endif - - if (atomic_read(&portal_kmemory) != 0) - CERROR("Portals memory leaked: %d bytes\n", - atomic_read(&portal_kmemory)); - - rc = portals_debug_cleanup(); - if (rc) - printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc); -} - -EXPORT_SYMBOL(kportal_daemonize); -EXPORT_SYMBOL(kportal_blockallsigs); -EXPORT_SYMBOL(kportal_assertion_failed); - -module_init(init_libcfs_module); -module_exit(exit_libcfs_module); diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c deleted file mode 100644 index 08446a0..0000000 --- a/lustre/portals/libcfs/proc.c +++ /dev/null @@ -1,321 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -# define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include "tracefile.h" - -static struct ctl_table_header *portals_table_header = NULL; -extern char debug_file_path[1024]; -extern char portals_upcall[1024]; - -#define PSDEV_PORTALS (0x100) -enum { - PSDEV_DEBUG = 1, /* control debugging */ - PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ - PSDEV_PRINTK, /* force all errors to console */ - PSDEV_CONSOLE, /* allow _any_ messages to console */ - PSDEV_DEBUG_PATH, /* crashdump log location */ - PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ - PSDEV_PORTALS_UPCALL, /* User mode upcall script */ - PSDEV_PORTALS_MEMUSED, /* bytes currently PORTAL_ALLOCated */ -}; - -static struct ctl_table portals_table[] = { - {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, - &proc_dointvec}, - {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, - sizeof(int), 0644, NULL, &proc_dointvec}, - {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, - &proc_dointvec}, - {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, - sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, - {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, - sizeof(portals_upcall), 0644, NULL, &proc_dostring, - &sysctl_string}, - {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter, - sizeof(int), 0644, NULL, &proc_dointvec}, - {0} -}; - -static struct ctl_table top_table[2] = { - {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, - {0} -}; - - -#ifdef PORTALS_PROFILING -/* - * profiling stuff. we do this statically for now 'cause its simple, - * but we could do some tricks with elf sections to have this array - * automatically built. 
- */ -#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } - -struct prof_ent prof_ents[] = { - def_prof(our_recvmsg), - def_prof(our_sendmsg), - def_prof(socknal_recv), - def_prof(lib_parse), - def_prof(conn_list_walk), - def_prof(memcpy), - def_prof(lib_finalize), - def_prof(pingcli_time), - def_prof(gmnal_send), - def_prof(gmnal_recv), -}; - -EXPORT_SYMBOL(prof_ents); - -/* - * this function is as crazy as the proc filling api - * requires. - * - * buffer: page allocated for us to scribble in. the - * data returned to the user will be taken from here. - * *start: address of the pointer that will tell the - * caller where in buffer the data the user wants is. - * ppos: offset in the entire /proc file that the user - * currently wants. - * wanted: the amount of data the user wants. - * - * while going, 'curpos' is the offset in the entire - * file where we currently are. We only actually - * start filling buffer when we get to a place in - * the file that the user cares about. - * - * we take care to only sprintf when the user cares because - * we're holding a lock while we do this. - * - * we're smart and know that we generate fixed size lines. - * we only start writing to the buffer when the user cares. - * This is unpredictable because we don't snapshot the - * list between calls that are filling in a file from - * the list. The list could change mid read and the - * output will look very weird indeed. oh well. 
- */ - -static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted, - int *eof, void *data) -{ - int len = 0, i; - int curpos; - char *header = "Interval Cycles_per (Starts Finishes Total)\n"; - int header_len = strlen(header); - char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)"; - int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1); - - *start = buffer; - - if (ppos < header_len) { - int diff = MIN(header_len, wanted); - memcpy(buffer, header + ppos, diff); - len += diff; - ppos += diff; - } - - if (len >= wanted) - goto out; - - curpos = header_len; - - for ( i = 0; i < MAX_PROFS ; i++) { - int copied; - struct prof_ent *pe = &prof_ents[i]; - long long cycles_per; - /* - * find the part of the array that the buffer wants - */ - if (ppos >= (curpos + line_len)) { - curpos += line_len; - continue; - } - /* the clever caller split a line */ - if (ppos > curpos) { - *start = buffer + (ppos - curpos); - } - - if (pe->finishes == 0) - cycles_per = 0; - else - { - cycles_per = pe->total_cycles; - do_div (cycles_per, pe->finishes); - } - - copied = sprintf(buffer + len, format, pe->str, cycles_per, - pe->starts, pe->finishes, pe->total_cycles); - - len += copied; - - /* pad to line len, -1 for \n */ - if ((copied < line_len-1)) { - int diff = (line_len-1) - copied; - memset(buffer + len, ' ', diff); - len += diff; - copied += diff; - } - - buffer[len++]= '\n'; - - /* bail if we have enough */ - if (((buffer + len) - *start) >= wanted) - break; - - curpos += line_len; - } - - /* lameness */ - if (i == MAX_PROFS) - *eof = 1; - out: - - return MIN(((buffer + len) - *start), wanted); -} - -/* - * all kids love /proc :/ - */ -static unsigned char basedir[]="net/portals"; -#endif /* PORTALS_PROFILING */ - -int insert_proc(void) -{ - struct proc_dir_entry *ent; -#if PORTALS_PROFILING - unsigned char dir[128]; - - if (ARRAY_SIZE(prof_ents) != MAX_PROFS) { - CERROR("profiling enum and array are out of sync.\n"); - return -1; - } - - /* - * This is 
pretty lame. assuming that failure just - * means that they already existed. - */ - strcat(dir, basedir); - create_proc_entry(dir, S_IFDIR, 0); - - strcat(dir, "/cycles"); - ent = create_proc_entry(dir, 0, 0); - if (!ent) { - CERROR("couldn't register %s?\n", dir); - return -1; - } - - ent->data = NULL; - ent->read_proc = prof_read_proc; -#endif /* PORTALS_PROFILING */ - -#ifdef CONFIG_SYSCTL - if (!portals_table_header) - portals_table_header = register_sysctl_table(top_table, 0); -#endif - - ent = create_proc_entry("sys/portals/dump_kernel", 0, NULL); - if (ent == NULL) { - CERROR("couldn't register dump_kernel\n"); - return -1; - } - ent->write_proc = trace_dk; - - ent = create_proc_entry("sys/portals/daemon_file", 0, NULL); - if (ent == NULL) { - CERROR("couldn't register daemon_file\n"); - return -1; - } - ent->write_proc = trace_write_daemon_file; - ent->read_proc = trace_read_daemon_file; - - ent = create_proc_entry("sys/portals/debug_mb", 0, NULL); - if (ent == NULL) { - CERROR("couldn't register debug_mb\n"); - return -1; - } - ent->write_proc = trace_write_debug_mb; - ent->read_proc = trace_read_debug_mb; - - return 0; -} - -void remove_proc(void) -{ -#if PORTALS_PROFILING - unsigned char dir[128]; - int end; - - dir[0]='\0'; - strcat(dir, basedir); - - end = strlen(dir); - - strcat(dir, "/cycles"); - remove_proc_entry(dir, 0); - - dir[end] = '\0'; - remove_proc_entry(dir, 0); -#endif /* PORTALS_PROFILING */ - - remove_proc_entry("sys/portals/dump_kernel", NULL); - remove_proc_entry("sys/portals/daemon_file", NULL); - remove_proc_entry("sys/portals/debug_mb", NULL); - -#ifdef CONFIG_SYSCTL - if (portals_table_header) - unregister_sysctl_table(portals_table_header); - portals_table_header = NULL; -#endif -} diff --git a/lustre/portals/libcfs/tracefile.c b/lustre/portals/libcfs/tracefile.c deleted file mode 100644 index f0c06e5..0000000 --- a/lustre/portals/libcfs/tracefile.c +++ /dev/null @@ -1,876 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; 
indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Zach Brown - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_MM_INLINE -#include -#endif - -#define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include - -#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) - -/* XXX move things up to the top, comment */ - -static union { - struct trace_cpu_data { - struct list_head tcd_pages; - unsigned long tcd_cur_pages; - - struct list_head tcd_daemon_pages; - unsigned long tcd_cur_daemon_pages; - - unsigned long tcd_max_pages; - int tcd_shutting_down; - } tcd; - char __pad[SMP_CACHE_BYTES]; -} trace_data[NR_CPUS] __cacheline_aligned; - -struct page_collection { - struct list_head pc_pages; - spinlock_t pc_lock; - int pc_want_daemon_pages; -}; - -struct tracefiled_ctl { - struct completion tctl_start; - struct completion tctl_stop; - wait_queue_head_t tctl_waitq; - pid_t tctl_pid; - atomic_t tctl_shutdown; -}; - -#define TRACEFILE_SIZE (500 << 20) -static DECLARE_RWSEM(tracefile_sem); -static char *tracefile = NULL; -static long long tracefile_size = TRACEFILE_SIZE; -static struct tracefiled_ctl trace_tctl; -static 
DECLARE_MUTEX(trace_thread_sem); -static int thread_running = 0; - -#ifndef get_cpu -#define get_cpu() smp_processor_id() -#define put_cpu() do { } while (0) -#endif - -#define trace_get_tcd(FLAGS) ({ \ - struct trace_cpu_data *__ret; \ - int __cpu = get_cpu(); \ - local_irq_save(FLAGS); \ - __ret = &trace_data[__cpu].tcd; \ - __ret; \ -}) - -#define trace_put_tcd(TCD, FLAGS) do { \ - local_irq_restore(FLAGS); \ - put_cpu(); \ -} while (0) - -static void put_pages_on_daemon_list_on_cpu(void *info); - -/* return a page that has 'len' bytes left at the end */ -static struct page *trace_get_page(struct trace_cpu_data *tcd, - unsigned long len) -{ - struct page *page = NULL; - - if (len > PAGE_SIZE) { - printk(KERN_ERR "cowardly refusing to write %lu bytes in a " - "page\n", len); - return NULL; - } - - if (!list_empty(&tcd->tcd_pages)) { - page = list_entry(tcd->tcd_pages.prev, struct page, - PAGE_LIST_ENTRY); - if (page->index + len <= PAGE_SIZE) - return page; - } - - if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { - page = alloc_page(GFP_ATOMIC); - if (page == NULL) { - /* the kernel should print a message for us. fall back - * to using the last page in the ring buffer. 
*/ - goto ring_buffer; - } - page->index = 0; - page->mapping = (void *)(long)smp_processor_id(); - list_add_tail(&PAGE_LIST(page), &tcd->tcd_pages); - tcd->tcd_cur_pages++; - - if (tcd->tcd_cur_pages > 8 && thread_running) { - struct tracefiled_ctl *tctl = &trace_tctl; - wake_up(&tctl->tctl_waitq); - } - return page; - } - - ring_buffer: - if (thread_running) { - int pgcount = tcd->tcd_cur_pages / 10; - struct page_collection pc; - struct list_head *pos, *tmp; - printk(KERN_WARNING "debug daemon buffer overflowed; discarding" - " 10%% of pages (%d)\n", pgcount + 1); - - INIT_LIST_HEAD(&pc.pc_pages); - spin_lock_init(&pc.pc_lock); - - list_for_each_safe(pos, tmp, &tcd->tcd_pages) { - struct page *page; - - if (pgcount-- == 0) - break; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - list_del(&PAGE_LIST(page)); - list_add_tail(&PAGE_LIST(page), &pc.pc_pages); - tcd->tcd_cur_pages--; - } - put_pages_on_daemon_list_on_cpu(&pc); - } - LASSERT(!list_empty(&tcd->tcd_pages)); - - page = list_entry(tcd->tcd_pages.next, struct page, PAGE_LIST_ENTRY); - page->index = 0; - - list_del(&PAGE_LIST(page)); - list_add_tail(&PAGE_LIST(page), &tcd->tcd_pages); - return page; -} - -static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, - int len, char *file, const char *fn) -{ - char *prefix = NULL, *ptype = NULL; - - if ((mask & D_EMERG) != 0) { - prefix = "LustreError"; - ptype = KERN_EMERG; - } else if ((mask & D_ERROR) != 0) { - prefix = "LustreError"; - ptype = KERN_ERR; - } else if ((mask & D_WARNING) != 0) { - prefix = "Lustre"; - ptype = KERN_WARNING; - } else if (portal_printk) { - prefix = "Lustre"; - ptype = KERN_INFO; - } - - printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, - hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); -} - -void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, char *format, ...) 
-{ - struct trace_cpu_data *tcd; - struct ptldebug_header header; - struct page *page; - char *debug_buf = format; - int known_size, needed = 85 /* average message length */, max_nob; - va_list ap; - unsigned long flags; - struct timeval tv; - - if (*(format + strlen(format) - 1) != '\n') - printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n", - file, line, fn); - - tcd = trace_get_tcd(flags); - if (tcd->tcd_shutting_down) - goto out; - - do_gettimeofday(&tv); - - header.ph_subsys = subsys; - header.ph_mask = mask; - header.ph_cpu_id = smp_processor_id(); - header.ph_sec = (__u32)tv.tv_sec; - header.ph_usec = tv.tv_usec; - header.ph_stack = stack; - header.ph_pid = current->pid; - header.ph_line_num = line; - -#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) - header.ph_extern_pid = current->thread.extern_pid; -#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - header.ph_extern_pid = current->thread.mode.tt.extern_pid; -#else - header.ph_extern_pid = 0; -#endif - - known_size = sizeof(header) + strlen(file) + strlen(fn) + 2; // nulls - - retry: - page = trace_get_page(tcd, needed + known_size); - if (page == NULL) { - debug_buf = format; - if (needed + known_size > PAGE_SIZE) - mask |= D_ERROR; - needed = strlen(format); - goto out; - } - - debug_buf = page_address(page) + page->index + known_size; - - max_nob = PAGE_SIZE - page->index - known_size; - LASSERT(max_nob > 0); - va_start(ap, format); - needed = vsnprintf(debug_buf, max_nob, format, ap); - va_end(ap); - - if (needed > max_nob) /* overflow. oh poop. 
*/ - goto retry; - - header.ph_len = known_size + needed; - debug_buf = page_address(page) + page->index; - - memcpy(debug_buf, &header, sizeof(header)); - page->index += sizeof(header); - debug_buf += sizeof(header); - - strcpy(debug_buf, file); - page->index += strlen(file) + 1; - debug_buf += strlen(file) + 1; - - strcpy(debug_buf, fn); - page->index += strlen(fn) + 1; - debug_buf += strlen(fn) + 1; - - page->index += needed; - if (page->index > PAGE_SIZE) - printk(KERN_EMERG "page->index == %lu in portals_debug_msg\n", - page->index); - - out: - if ((mask & (D_EMERG | D_ERROR | D_WARNING)) || portal_printk) - print_to_console(&header, mask, debug_buf, needed, file, fn); - - trace_put_tcd(tcd, flags); -} -EXPORT_SYMBOL(portals_debug_msg); - -static void collect_pages_on_cpu(void *info) -{ - struct trace_cpu_data *tcd; - unsigned long flags; - struct page_collection *pc = info; - - tcd = trace_get_tcd(flags); - - spin_lock(&pc->pc_lock); - list_splice(&tcd->tcd_pages, &pc->pc_pages); - INIT_LIST_HEAD(&tcd->tcd_pages); - tcd->tcd_cur_pages = 0; - if (pc->pc_want_daemon_pages) { - list_splice(&tcd->tcd_daemon_pages, &pc->pc_pages); - INIT_LIST_HEAD(&tcd->tcd_daemon_pages); - tcd->tcd_cur_daemon_pages = 0; - } - spin_unlock(&pc->pc_lock); - - trace_put_tcd(tcd, flags); -} - -static void collect_pages(struct page_collection *pc) -{ - /* needs to be fixed up for preempt */ - INIT_LIST_HEAD(&pc->pc_pages); - collect_pages_on_cpu(pc); - smp_call_function(collect_pages_on_cpu, pc, 0, 1); -} - -static void put_pages_back_on_cpu(void *info) -{ - struct page_collection *pc = info; - struct trace_cpu_data *tcd; - struct list_head *pos, *tmp, *cur_head; - unsigned long flags; - - tcd = trace_get_tcd(flags); - - cur_head = tcd->tcd_pages.next; - - spin_lock(&pc->pc_lock); - list_for_each_safe(pos, tmp, &pc->pc_pages) { - struct page *page; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - if 
((unsigned long)page->mapping != smp_processor_id()) - continue; - - list_del(&PAGE_LIST(page)); - list_add_tail(&PAGE_LIST(page), cur_head); - tcd->tcd_cur_pages++; - } - spin_unlock(&pc->pc_lock); - - trace_put_tcd(tcd, flags); -} - -static void put_pages_back(struct page_collection *pc) -{ - /* needs to be fixed up for preempt */ - put_pages_back_on_cpu(pc); - smp_call_function(put_pages_back_on_cpu, pc, 0, 1); -} - -/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that - * we have a good amount of data at all times for dumping during an LBUG, even - * if we have been steadily writing (and otherwise discarding) pages via the - * debug daemon. */ -static void put_pages_on_daemon_list_on_cpu(void *info) -{ - struct page_collection *pc = info; - struct trace_cpu_data *tcd; - struct list_head *pos, *tmp; - unsigned long flags; - - tcd = trace_get_tcd(flags); - - spin_lock(&pc->pc_lock); - list_for_each_safe(pos, tmp, &pc->pc_pages) { - struct page *page; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - if ((unsigned long)page->mapping != smp_processor_id()) - continue; - - list_del(&PAGE_LIST(page)); - list_add_tail(&PAGE_LIST(page), &tcd->tcd_daemon_pages); - tcd->tcd_cur_daemon_pages++; - - if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { - LASSERT(!list_empty(&tcd->tcd_daemon_pages)); - page = list_entry(tcd->tcd_daemon_pages.next, - struct page, PAGE_LIST_ENTRY); - - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - page->index = 0; - list_del(&PAGE_LIST(page)); - page->mapping = NULL; - __free_page(page); - tcd->tcd_cur_daemon_pages--; - } - } - spin_unlock(&pc->pc_lock); - - trace_put_tcd(tcd, flags); -} - -static void put_pages_on_daemon_list(struct page_collection *pc) -{ - put_pages_on_daemon_list_on_cpu(pc); - smp_call_function(put_pages_on_daemon_list_on_cpu, pc, 0, 1); -} - -void trace_debug_print(void) -{ - struct 
page_collection pc; - struct list_head *pos, *tmp; - - spin_lock_init(&pc.pc_lock); - - collect_pages(&pc); - list_for_each_safe(pos, tmp, &pc.pc_pages) { - struct page *page; - char *p, *file, *fn; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - p = page_address(page); - while (p < ((char *)page_address(page) + PAGE_SIZE)) { - struct ptldebug_header *hdr; - int len; - hdr = (void *)p; - p += sizeof(*hdr); - file = p; - p += strlen(file) + 1; - fn = p; - p += strlen(fn) + 1; - len = hdr->ph_len - (p - (char *)hdr); - - print_to_console(hdr, D_EMERG, p, len, file, fn); - } - - list_del(&PAGE_LIST(page)); - page->mapping = NULL; - __free_page(page); - } -} - -int tracefile_dump_all_pages(char *filename) -{ - struct page_collection pc; - struct file *filp; - struct list_head *pos, *tmp; - mm_segment_t oldfs; - int rc; - - down_write(&tracefile_sem); - - filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); - if (IS_ERR(filp)) { - rc = PTR_ERR(filp); - printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", - filename, rc); - goto out; - } - - spin_lock_init(&pc.pc_lock); - pc.pc_want_daemon_pages = 1; - collect_pages(&pc); - if (list_empty(&pc.pc_pages)) { - rc = 0; - goto close; - } - - /* ok, for now, just write the pages. 
in the future we'll be building - * iobufs with the pages and calling generic_direct_IO */ - oldfs = get_fs(); - set_fs(get_ds()); - list_for_each_safe(pos, tmp, &pc.pc_pages) { - struct page *page; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - rc = filp->f_op->write(filp, page_address(page), page->index, - &filp->f_pos); - if (rc != page->index) { - printk(KERN_WARNING "wanted to write %lu but wrote " - "%d\n", page->index, rc); - put_pages_back(&pc); - break; - } - list_del(&PAGE_LIST(page)); - page->mapping = NULL; - __free_page(page); - } - set_fs(oldfs); - rc = filp->f_op->fsync(filp, filp->f_dentry, 1); - if (rc) - printk(KERN_ERR "sync returns %d\n", rc); - close: - filp_close(filp, 0); - out: - up_write(&tracefile_sem); - return rc; -} - -void trace_flush_pages(void) -{ - struct page_collection pc; - struct list_head *pos, *tmp; - - spin_lock_init(&pc.pc_lock); - - collect_pages(&pc); - list_for_each_safe(pos, tmp, &pc.pc_pages) { - struct page *page; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - list_del(&PAGE_LIST(page)); - page->mapping = NULL; - __free_page(page); - } -} - -int trace_dk(struct file *file, const char *buffer, unsigned long count, - void *data) -{ - char *name; - unsigned long off; - int rc; - - name = kmalloc(count + 1, GFP_KERNEL); - if (name == NULL) - return -ENOMEM; - - if (copy_from_user(name, buffer, count)) { - rc = -EFAULT; - goto out; - } - - if (name[0] != '/') { - rc = -EINVAL; - goto out; - } - - /* be nice and strip out trailing '\n' */ - for (off = count ; off > 2 && isspace(name[off - 1]); off--) - ; - - name[off] = '\0'; - rc = tracefile_dump_all_pages(name); -out: - if (name) - kfree(name); - return count; -} -EXPORT_SYMBOL(trace_dk); - -static int tracefiled(void *arg) -{ - struct page_collection pc; - struct tracefiled_ctl *tctl = arg; - struct 
list_head *pos, *tmp; - struct ptldebug_header *hdr; - struct file *filp; - struct page *page; - mm_segment_t oldfs; - int rc; - - /* we're started late enough that we pick up init's fs context */ - /* this is so broken in uml? what on earth is going on? */ - kportal_daemonize("ktracefiled"); - reparent_to_init(); - - spin_lock_init(&pc.pc_lock); - complete(&tctl->tctl_start); - - while (1) { - wait_queue_t __wait; - - init_waitqueue_entry(&__wait, current); - add_wait_queue(&tctl->tctl_waitq, &__wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - remove_wait_queue(&tctl->tctl_waitq, &__wait); - - if (atomic_read(&tctl->tctl_shutdown)) - break; - - pc.pc_want_daemon_pages = 0; - collect_pages(&pc); - if (list_empty(&pc.pc_pages)) - continue; - - filp = NULL; - down_read(&tracefile_sem); - if (tracefile != NULL) { - filp = filp_open(tracefile, O_CREAT|O_RDWR|O_LARGEFILE, - 0600); - if (IS_ERR(filp)) { - printk("couldn't open %s: %ld\n", tracefile, - PTR_ERR(filp)); - filp = NULL; - } - } - up_read(&tracefile_sem); - if (filp == NULL) { - put_pages_on_daemon_list(&pc); - continue; - } - - oldfs = get_fs(); - set_fs(get_ds()); - - /* mark the first header, so we can sort in chunks */ - page = list_entry(pc.pc_pages.next, struct page, - PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - hdr = page_address(page); - hdr->ph_flags |= PH_FLAG_FIRST_RECORD; - - list_for_each_safe(pos, tmp, &pc.pc_pages) { - static loff_t f_pos; - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - if (f_pos >= tracefile_size) - f_pos = 0; - else if (f_pos > filp->f_dentry->d_inode->i_size) - f_pos = filp->f_dentry->d_inode->i_size; - - rc = filp->f_op->write(filp, page_address(page), - page->index, &f_pos); - if (rc != page->index) { - printk(KERN_WARNING "wanted to write %lu but " - "wrote %d\n", page->index, rc); - put_pages_back(&pc); - } - } - 
set_fs(oldfs); - filp_close(filp, 0); - - put_pages_on_daemon_list(&pc); - } - complete(&tctl->tctl_stop); - return 0; -} - -int trace_start_thread(void) -{ - struct tracefiled_ctl *tctl = &trace_tctl; - int rc = 0; - - down(&trace_thread_sem); - if (thread_running) - goto out; - - init_completion(&tctl->tctl_start); - init_completion(&tctl->tctl_stop); - init_waitqueue_head(&tctl->tctl_waitq); - atomic_set(&tctl->tctl_shutdown, 0); - - if (kernel_thread(tracefiled, tctl, 0) < 0) { - rc = -ECHILD; - goto out; - } - - wait_for_completion(&tctl->tctl_start); - thread_running = 1; -out: - up(&trace_thread_sem); - return rc; -} - -void trace_stop_thread(void) -{ - struct tracefiled_ctl *tctl = &trace_tctl; - - down(&trace_thread_sem); - if (thread_running) { - printk(KERN_INFO "Shutting down debug daemon thread...\n"); - atomic_set(&tctl->tctl_shutdown, 1); - wait_for_completion(&tctl->tctl_stop); - thread_running = 0; - } - up(&trace_thread_sem); -} - -int trace_write_daemon_file(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - char *name; - unsigned long off; - int rc; - - name = kmalloc(count + 1, GFP_KERNEL); - if (name == NULL) - return -ENOMEM; - - if (copy_from_user(name, buffer, count)) { - rc = -EFAULT; - goto out; - } - - /* be nice and strip out trailing '\n' */ - for (off = count ; off > 2 && isspace(name[off - 1]); off--) - ; - - name[off] = '\0'; - - down_write(&tracefile_sem); - if (strcmp(name, "stop") == 0) { - tracefile = NULL; - trace_stop_thread(); - goto out_sem; - } else if (strncmp(name, "size=", 5) == 0) { - tracefile_size = simple_strtoul(name + 5, NULL, 0); - if (tracefile_size < 10 || tracefile_size > 20480) - tracefile_size = TRACEFILE_SIZE; - else - tracefile_size <<= 20; - goto out_sem; - } - - if (name[0] != '/') { - rc = -EINVAL; - goto out_sem; - } - - if (tracefile != NULL) - kfree(tracefile); - - tracefile = name; - name = NULL; - - printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " - 
"to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); - - trace_start_thread(); - - out_sem: - up_write(&tracefile_sem); - - out: - kfree(name); - return count; -} - -int trace_read_daemon_file(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - int rc; - - down_read(&tracefile_sem); - rc = snprintf(page, count, "%s", tracefile); - up_read(&tracefile_sem); - - return rc; -} - -int trace_write_debug_mb(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - char string[32]; - int i; - unsigned max; - - if (count >= sizeof(string)) { - printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", - count); - return -EOVERFLOW; - } - - if (copy_from_user(string, buffer, count)) - return -EFAULT; - - max = simple_strtoul(string, NULL, 0); - if (max == 0) - return -EINVAL; - - if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) { - printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " - "%dMB, which is more than 80%% of available RAM (%lu)\n", - max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5); - return -EINVAL; - } - - max /= smp_num_cpus; - - for (i = 0; i < NR_CPUS; i++) { - struct trace_cpu_data *tcd; - tcd = &trace_data[i].tcd; - tcd->tcd_max_pages = max << (20 - PAGE_SHIFT); - } - return count; -} - -int trace_read_debug_mb(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct trace_cpu_data *tcd; - unsigned long flags; - int rc; - - tcd = trace_get_tcd(flags); - rc = snprintf(page, count, "%lu\n", - (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus); - trace_put_tcd(tcd, flags); - - return rc; -} - -int tracefile_init(void) -{ - struct trace_cpu_data *tcd; - int i; - - for (i = 0; i < NR_CPUS; i++) { - tcd = &trace_data[i].tcd; - INIT_LIST_HEAD(&tcd->tcd_pages); - INIT_LIST_HEAD(&tcd->tcd_daemon_pages); - tcd->tcd_cur_pages = 0; - tcd->tcd_cur_daemon_pages = 0; - tcd->tcd_max_pages = TCD_MAX_PAGES; - tcd->tcd_shutting_down = 0; - } - 
return 0; -} - -static void trace_cleanup_on_cpu(void *info) -{ - struct trace_cpu_data *tcd; - struct list_head *pos, *tmp; - unsigned long flags; - - tcd = trace_get_tcd(flags); - - tcd->tcd_shutting_down = 1; - - list_for_each_safe(pos, tmp, &tcd->tcd_pages) { - struct page *page; - - page = list_entry(pos, struct page, PAGE_LIST_ENTRY); - LASSERT(page->index <= PAGE_SIZE); - LASSERT(page_count(page) > 0); - - list_del(&PAGE_LIST(page)); - page->mapping = NULL; - __free_page(page); - } - tcd->tcd_cur_pages = 0; - - trace_put_tcd(tcd, flags); -} - -static void trace_cleanup(void) -{ - struct page_collection pc; - - INIT_LIST_HEAD(&pc.pc_pages); - spin_lock_init(&pc.pc_lock); - - trace_cleanup_on_cpu(&pc); - smp_call_function(trace_cleanup_on_cpu, &pc, 0, 1); -} - -void tracefile_exit(void) -{ - trace_stop_thread(); - trace_cleanup(); -} diff --git a/lustre/portals/libcfs/tracefile.h b/lustre/portals/libcfs/tracefile.h deleted file mode 100644 index f581257..0000000 --- a/lustre/portals/libcfs/tracefile.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __PORTALS_TRACEFILE_H -#define __PORTALS_TRACEFILE_H - -int tracefile_dump_all_pages(char *filename); -void trace_debug_print(void); -void trace_flush_pages(void); -int trace_start_thread(void); -void trace_stop_thread(void); -int tracefile_init(void); -void tracefile_exit(void); -int trace_write_daemon_file(struct file *file, const char *buffer, - unsigned long count, void *data); -int trace_read_daemon_file(char *page, char **start, off_t off, int count, - int *eof, void *data); -int trace_write_debug_mb(struct file *file, const char *buffer, - unsigned long count, void *data); -int trace_read_debug_mb(char *page, char **start, off_t off, int count, - int *eof, void *data); -int trace_dk(struct file *file, const char *buffer, unsigned long count, - void *data); - -#endif /* __PORTALS_TRACEFILE_H */ diff --git a/lustre/portals/libcfs/watchdog.c b/lustre/portals/libcfs/watchdog.c deleted file mode 100644 index 
844845a..0000000 --- a/lustre/portals/libcfs/watchdog.c +++ /dev/null @@ -1,402 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Jacob Berkman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include - - - -struct lc_watchdog { - struct timer_list lcw_timer; /* kernel timer */ - struct list_head lcw_list; - struct timeval lcw_last_touched; - struct task_struct *lcw_task; - - void (*lcw_callback)(struct lc_watchdog *, - struct task_struct *, - void *data); - void *lcw_data; - - int lcw_pid; - int lcw_time; /* time until watchdog fires, in ms */ - - enum { - LC_WATCHDOG_DISABLED, - LC_WATCHDOG_ENABLED, - LC_WATCHDOG_EXPIRED - } lcw_state; -}; - -/* - * The dispatcher will complete lcw_start_completion when it starts, - * and lcw_stop_completion when it exits. - * Wake lcw_event_waitq to signal timer callback dispatches. - */ -static struct completion lcw_start_completion; -static struct completion lcw_stop_completion; -static wait_queue_head_t lcw_event_waitq; - -/* - * Set this and wake lcw_event_waitq to stop the dispatcher. - */ -enum { - LCW_FLAG_STOP = 0 -}; -static unsigned long lcw_flags = 0; - -/* - * Number of outstanding watchdogs. 
- * When it hits 1, we start the dispatcher. - * When it hits 0, we stop the distpatcher. - */ -static __u32 lcw_refcount = 0; -static DECLARE_MUTEX(lcw_refcount_sem); - -/* - * List of timers that have fired that need their callbacks run by the - * dispatcher. - */ -static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; -static struct list_head lcw_pending_timers = \ - LIST_HEAD_INIT(lcw_pending_timers); - -static struct task_struct *lcw_lookup_task(struct lc_watchdog *lcw) -{ - struct task_struct *tsk; - unsigned long flags; - ENTRY; - - read_lock_irqsave(&tasklist_lock, flags); - tsk = find_task_by_pid(lcw->lcw_pid); - read_unlock_irqrestore(&tasklist_lock, flags); - if (!tsk) { - CWARN("Process %d was not found in the task list; " - "watchdog callback may be incomplete\n", lcw->lcw_pid); - } else if (tsk != lcw->lcw_task) { - tsk = NULL; - CWARN("The current process %d did not set the watchdog; " - "watchdog callback may be incomplete\n", lcw->lcw_pid); - } - - RETURN(tsk); -} - -static void lcw_cb(unsigned long data) -{ - struct lc_watchdog *lcw = (struct lc_watchdog *)data; - struct task_struct *tsk; - unsigned long flags; - - ENTRY; - - if (lcw->lcw_state != LC_WATCHDOG_ENABLED) { - EXIT; - return; - } - - lcw->lcw_state = LC_WATCHDOG_EXPIRED; - - CWARN("Watchdog triggered for pid %d: it was inactive for %dus\n", - lcw->lcw_pid, (lcw->lcw_time * 1000) / HZ); - - tsk = lcw_lookup_task(lcw); - if (tsk != NULL) - portals_debug_dumpstack(tsk); - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - if (list_empty(&lcw->lcw_list)) { - list_add(&lcw->lcw_list, &lcw_pending_timers); - wake_up(&lcw_event_waitq); - } - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - - EXIT; -} - -static int is_watchdog_fired(void) -{ - unsigned long flags; - int rc; - - if (test_bit(LCW_FLAG_STOP, &lcw_flags)) - return 1; - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - rc = !list_empty(&lcw_pending_timers); - 
spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - return rc; -} - -static int lcw_dispatch_main(void *data) -{ - int rc = 0; - unsigned long flags; - struct lc_watchdog *lcw; - struct task_struct *tsk; - - ENTRY; - - kportal_daemonize("lc_watchdogd"); - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(¤t->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); - - complete(&lcw_start_completion); - - while (1) { - wait_event_interruptible(lcw_event_waitq, is_watchdog_fired()); - CDEBUG(D_INFO, "Watchdog got woken up...\n"); - if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { - CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n"); - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - rc = !list_empty(&lcw_pending_timers); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - if (rc) { - CERROR("pending timers list was not empty at " - "time of watchdog dispatch shutdown\n"); - } - break; - } - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - while (!list_empty(&lcw_pending_timers)) { - - lcw = list_entry(lcw_pending_timers.next, - struct lc_watchdog, - lcw_list); - list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - - CDEBUG(D_INFO, "found lcw for pid %d\n", lcw->lcw_pid); - - if (lcw->lcw_state != LC_WATCHDOG_DISABLED) { - /* - * sanity check the task against our - * watchdog - */ - tsk = lcw_lookup_task(lcw); - lcw->lcw_callback(lcw, tsk, lcw->lcw_data); - } - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - } - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - } - - complete(&lcw_stop_completion); - - RETURN(rc); -} - -static void lcw_dispatch_start(void) -{ - int rc; - - ENTRY; - LASSERT(lcw_refcount == 1); - - init_completion(&lcw_stop_completion); - init_completion(&lcw_start_completion); - init_waitqueue_head(&lcw_event_waitq); - - CDEBUG(D_INFO, "starting dispatch thread\n"); - rc = kernel_thread(lcw_dispatch_main, NULL, 0); - if (rc < 0) { - CERROR("error 
spawning watchdog dispatch thread: %d\n", rc); - EXIT; - return; - } - wait_for_completion(&lcw_start_completion); - CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n"); - - EXIT; -} - -static void lcw_dispatch_stop(void) -{ - ENTRY; - LASSERT(lcw_refcount == 0); - - CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); - - set_bit(LCW_FLAG_STOP, &lcw_flags); - wake_up(&lcw_event_waitq); - - wait_for_completion(&lcw_stop_completion); - - CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); - - EXIT; -} - -struct lc_watchdog *lc_watchdog_add(int time, - void (*callback)(struct lc_watchdog *, - struct task_struct *, - void *), - void *data) -{ - struct lc_watchdog *lcw = NULL; - ENTRY; - - PORTAL_ALLOC(lcw, sizeof(*lcw)); - if (!lcw) { - CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n"); - RETURN(ERR_PTR(-ENOMEM)); - } - - lcw->lcw_task = current; - lcw->lcw_pid = current->pid; - lcw->lcw_time = (time * HZ) / 1000; - lcw->lcw_callback = callback ? callback : lc_watchdog_dumplog; - lcw->lcw_data = data; - lcw->lcw_state = LC_WATCHDOG_DISABLED; - - INIT_LIST_HEAD(&lcw->lcw_list); - - lcw->lcw_timer.function = lcw_cb; - lcw->lcw_timer.data = (unsigned long)lcw; - lcw->lcw_timer.expires = jiffies + lcw->lcw_time; - init_timer(&lcw->lcw_timer); - - down(&lcw_refcount_sem); - if (++lcw_refcount == 1) - lcw_dispatch_start(); - up(&lcw_refcount_sem); - - /* Keep this working in case we enable them by default */ - if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { - do_gettimeofday(&lcw->lcw_last_touched); - add_timer(&lcw->lcw_timer); - } - - RETURN(lcw); -} -EXPORT_SYMBOL(lc_watchdog_add); - -static long -timeval_sub(struct timeval *large, struct timeval *small) -{ - return (large->tv_sec - small->tv_sec) * 1000000 + - (large->tv_usec - small->tv_usec); -} - -static void lcw_update_time(struct lc_watchdog *lcw, const char *message) -{ - struct timeval newtime; - unsigned long timediff; - - do_gettimeofday(&newtime); - if (lcw->lcw_state == 
LC_WATCHDOG_EXPIRED) { - timediff = timeval_sub(&newtime, &lcw->lcw_last_touched); - CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n", - lcw->lcw_pid, - message, - timediff / 1000000, - (timediff % 1000000) / 100); - } - lcw->lcw_last_touched = newtime; -} - -void lc_watchdog_touch(struct lc_watchdog *lcw) -{ - unsigned long flags; - ENTRY; - LASSERT(lcw != NULL); - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - if (!list_empty(&lcw->lcw_list)) - list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - - lcw_update_time(lcw, "touched"); - lcw->lcw_state = LC_WATCHDOG_ENABLED; - - mod_timer(&lcw->lcw_timer, jiffies + lcw->lcw_time); - - EXIT; -} -EXPORT_SYMBOL(lc_watchdog_touch); - -void lc_watchdog_disable(struct lc_watchdog *lcw) -{ - unsigned long flags; - ENTRY; - LASSERT(lcw != NULL); - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - if (!list_empty(&lcw->lcw_list)) - list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - - lcw_update_time(lcw, "disabled"); - lcw->lcw_state = LC_WATCHDOG_DISABLED; - - EXIT; -} -EXPORT_SYMBOL(lc_watchdog_disable); - -void lc_watchdog_delete(struct lc_watchdog *lcw) -{ - unsigned long flags; - ENTRY; - LASSERT(lcw != NULL); - - del_timer(&lcw->lcw_timer); - - lcw_update_time(lcw, "deleted"); - - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - if (!list_empty(&lcw->lcw_list)) - list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); - - down(&lcw_refcount_sem); - if (--lcw_refcount == 0) - lcw_dispatch_stop(); - up(&lcw_refcount_sem); - - PORTAL_FREE(lcw, sizeof(*lcw)); - - EXIT; -} -EXPORT_SYMBOL(lc_watchdog_delete); - -/* - * Provided watchdog handlers - */ - -extern void portals_debug_dumplog_internal(void *arg); - -void lc_watchdog_dumplog(struct lc_watchdog *lcw, - struct task_struct *tsk, - void *data) -{ - tsk = tsk ? 
tsk : current; - portals_debug_dumplog_internal((void *)(long)tsk->pid); -} -EXPORT_SYMBOL(lc_watchdog_dumplog); diff --git a/lustre/portals/packaging/.cvsignore b/lustre/portals/packaging/.cvsignore deleted file mode 100644 index fd1d56a..0000000 --- a/lustre/portals/packaging/.cvsignore +++ /dev/null @@ -1,8 +0,0 @@ -Makefile -Makefile.in -aclocal.m4 -config.log -config.status -config.cache -configure -portals.spec diff --git a/lustre/portals/packaging/Makefile.am b/lustre/portals/packaging/Makefile.am deleted file mode 100644 index 126bc69..0000000 --- a/lustre/portals/packaging/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (C) 2002 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -EXTRA_DIST = portals.spec \ No newline at end of file diff --git a/lustre/portals/packaging/portals.spec.in b/lustre/portals/packaging/portals.spec.in deleted file mode 100644 index e196b3f..0000000 --- a/lustre/portals/packaging/portals.spec.in +++ /dev/null @@ -1,116 +0,0 @@ -%define kversion @RELEASE@ -%define linuxdir @LINUX@ -%define version HEAD - -Summary: Sandia Portals Message Passing - utilities -Name: portals -Version: %{version} -Release: 0210101748uml -Copyright: LGPL -Group: Utilities/System -BuildRoot: /var/tmp/portals-%{version}-root -Source: http://sandiaportals.org/portals-%{version}.tar.gz - -%description -Sandia Portals message passing package. Contains kernel modules, libraries and utilities. - -%package -n portals-modules -Summary: Kernel modules and NAL's for portals -Group: Development/Kernel - -%description -n portals-modules -Object-Based Disk storage drivers for Linux %{kversion}. 
- -%package -n portals-source -Summary: Portals kernel source for rebuilding with other kernels -Group: Development/Kernel - -%description -n portals-source -Portals kernel source for rebuilding with other kernels - -%prep -%setup -n portals-%{version} - -%build -rm -rf $RPM_BUILD_ROOT - -# Create the pristine source directory. -srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version} -mkdir -p $srcdir -find . -name CVS -prune -o -print | cpio -ap $srcdir - -# Set an explicit path to our Linux tree, if we can. -conf_flag= -linuxdir=%{linuxdir} -test -d $linuxdir && conf_flag=--with-linux=$linuxdir -./configure $conf_flag -make - -%install -make install prefix=$RPM_BUILD_ROOT - -%ifarch alpha -# this hurts me - conf_flag= - linuxdir=%{linuxdir} - test -d $linuxdir && conf_flag=--with-linux=$linuxdir - make clean - ./configure --enable-rtscts-myrinet $conf_flag - make - cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o - cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload -%endif - - -%files -%attr(-, root, root) %doc COPYING -%attr(-, root, root) /usr/sbin/acceptor -%attr(-, root, root) /usr/sbin/ptlctl -%attr(-, root, root) /usr/sbin/debugctl -%ifarch alpha -%attr(-, root, root) /usr/sbin/mcpload -%endif -%attr(-, root, root) /lib/libmyrnal.a -%attr(-, root, root) /lib/libptlapi.a -%attr(-, root, root) /lib/libptlctl.a -%attr(-, root, root) /lib/libprocbridge.a -%attr(-, root, root) /lib/libptllib.a -%attr(-, root, root) /lib/libtcpnal.a -%attr(-, root, root) /lib/libtcpnalutil.a -%attr(-, root, root) /usr/include/portals/*.h -%attr(-, root, root) /usr/include/portals/base/*.h -%attr(-, root, root) /usr/include/linux/*.h - -%files -n portals-modules -%attr(-, root, root) %doc COPYING -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o 
-%ifarch alpha -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o -%endif -%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o - -%files -n portals-source -%attr(-, root, root) /usr/src/portals-%{version} - -%post -if [ ! -e /dev/portals ]; then - mknod /dev/portals c 10 240 -fi -depmod -ae || exit 0 - -grep -q portals /etc/modules.conf || \ - echo 'alias char-major-10-240 portals' >> /etc/modules.conf - -grep -q '/dev/portals' /etc/modules.conf || \ - echo 'alias /dev/portals portals' >> /etc/modules.conf - -%postun -depmod -ae || exit 0 - -%clean -#rm -rf $RPM_BUILD_ROOT - -# end of file diff --git a/lustre/portals/portals/.cvsignore b/lustre/portals/portals/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/portals/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/portals/Makefile.in b/lustre/portals/portals/Makefile.in deleted file mode 100644 index c0f2e71..0000000 --- a/lustre/portals/portals/Makefile.in +++ /dev/null @@ -1,6 +0,0 @@ -MODULES := portals -portals-objs := api-errno.o api-ni.o api-wrap.o -portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o -portals-objs += lib-move.o lib-ni.o lib-pid.o module.o - -@INCLUDE_RULES@ diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk deleted file mode 100644 index 088902a..0000000 --- a/lustre/portals/portals/Makefile.mk +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -include $(src)/../Kernelenv - -obj-y += portals.o -portals-objs := lib-eq.o lib-init.o lib-md.o lib-me.o \ - lib-move.o lib-msg.o lib-ni.o lib-pid.o \ - api-errno.o api-ni.o api-wrap.o \ - module.o diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c deleted file mode 100644 index 9a4e5ac..0000000 --- a/lustre/portals/portals/api-errno.c +++ /dev/null @@ -1,48 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-errno.c - * Instantiate the string table of errors - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - */ - -/* If you change these, you must update the number table in portals/errno.h */ -const char *ptl_err_str[] = { - "PTL_OK", - "PTL_SEGV", - - "PTL_NO_SPACE", - "PTL_ME_IN_USE", - "PTL_VAL_FAILED", - - "PTL_NAL_FAILED", - "PTL_NO_INIT", - "PTL_IFACE_DUP", - "PTL_IFACE_INVALID", - - "PTL_HANDLE_INVALID", - "PTL_MD_INVALID", - "PTL_ME_INVALID", -/* If you change these, you must update the number table in portals/errno.h */ - "PTL_PROCESS_INVALID", - "PTL_PT_INDEX_INVALID", - - "PTL_SR_INDEX_INVALID", - "PTL_EQ_INVALID", - "PTL_EQ_DROPPED", - - "PTL_EQ_EMPTY", - "PTL_MD_NO_UPDATE", - "PTL_FAIL", - - "PTL_IOV_INVALID", - - "PTL_EQ_IN_USE", - - "PTL_NI_INVALID", - "PTL_MD_ILLEGAL", - - "PTL_MAX_ERRNO" -}; -/* If you change these, you must update the number table in portals/errno.h */ diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c deleted file mode 100644 index 7e92256..0000000 --- a/lustre/portals/portals/api-ni.c +++ /dev/null @@ -1,265 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-ni.c - * Network Interface code - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
- * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int ptl_init; - -/* Put some magic in the NI handle so uninitialised/zeroed handles are easy - * to spot */ -#define NI_HANDLE_MAGIC 0xebc0de00 -#define NI_HANDLE_MASK 0x000000ff - -static struct nal_t *ptl_nal_table[NAL_MAX_NR + 1]; - -#ifdef __KERNEL__ -DECLARE_MUTEX(ptl_mutex); - -static void ptl_mutex_enter (void) -{ - down (&ptl_mutex); -} - -static void ptl_mutex_exit (void) -{ - up (&ptl_mutex); -} -#else -static void ptl_mutex_enter (void) -{ -} - -static void ptl_mutex_exit (void) -{ -} -#endif - -nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) -{ - unsigned int idx = handle->nal_idx; - - /* XXX we really rely on the caller NOT racing with interface - * setup/teardown. That ensures her NI handle can't get - * invalidated out from under her (or worse, swapped for a - * completely different interface!) 
*/ - - LASSERT (ptl_init); - - if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) - return NULL; - - idx &= NI_HANDLE_MASK; - - if (idx > NAL_MAX_NR || - ptl_nal_table[idx] == NULL || - ptl_nal_table[idx]->nal_refct == 0) - return NULL; - - return ptl_nal_table[idx]; -} - -int ptl_register_nal (ptl_interface_t interface, nal_t *nal) -{ - int rc; - - ptl_mutex_enter(); - - if (interface < 0 || interface > NAL_MAX_NR) - rc = PTL_IFACE_INVALID; - else if (ptl_nal_table[interface] != NULL) - rc = PTL_IFACE_DUP; - else { - rc = PTL_OK; - ptl_nal_table[interface] = nal; - LASSERT(nal->nal_refct == 0); - } - - ptl_mutex_exit(); - return (rc); -} - -void ptl_unregister_nal (ptl_interface_t interface) -{ - LASSERT(interface >= 0 && interface <= NAL_MAX_NR); - LASSERT(ptl_nal_table[interface] != NULL); - LASSERT(ptl_nal_table[interface]->nal_refct == 0); - - ptl_mutex_enter(); - - ptl_nal_table[interface] = NULL; - - ptl_mutex_exit(); -} - -int PtlInit(int *max_interfaces) -{ - LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); - - /* If this assertion fails, we need more bits in NI_HANDLE_MASK and - * to shift NI_HANDLE_MAGIC left appropriately */ - LASSERT (NAL_MAX_NR < (NI_HANDLE_MASK + 1)); - - if (max_interfaces != NULL) - *max_interfaces = NAL_MAX_NR + 1; - - ptl_mutex_enter(); - - if (!ptl_init) { - /* NULL pointers, clear flags */ - memset(ptl_nal_table, 0, sizeof(ptl_nal_table)); -#ifndef __KERNEL__ - /* Kernel NALs register themselves when their module loads, - * and unregister themselves when their module is unloaded. - * Userspace NALs, are plugged in explicitly here... */ - { - extern nal_t procapi_nal; - - /* XXX pretend it's socknal to keep liblustre happy... 
*/ - ptl_nal_table[SOCKNAL] = &procapi_nal; - LASSERT (procapi_nal.nal_refct == 0); - } -#endif - ptl_init = 1; - } - - ptl_mutex_exit(); - - return PTL_OK; -} - -void PtlFini(void) -{ - nal_t *nal; - int i; - - ptl_mutex_enter(); - - if (ptl_init) { - for (i = 0; i <= NAL_MAX_NR; i++) { - - nal = ptl_nal_table[i]; - if (nal == NULL) - continue; - - if (nal->nal_refct != 0) { - CWARN("NAL %x has outstanding refcount %d\n", - i, nal->nal_refct); - nal->nal_ni_fini(nal); - } - - ptl_nal_table[i] = NULL; - } - - ptl_init = 0; - } - - ptl_mutex_exit(); -} - -int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, - ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, - ptl_handle_ni_t *handle) -{ - nal_t *nal; - int i; - int rc; - - if (!ptl_init) - return PTL_NO_INIT; - - ptl_mutex_enter (); - - if (interface == PTL_IFACE_DEFAULT) { - for (i = 0; i <= NAL_MAX_NR; i++) - if (ptl_nal_table[i] != NULL) { - interface = i; - break; - } - /* NB if no interfaces are registered, 'interface' will - * fail the valid test below */ - } - - if (interface < 0 || - interface > NAL_MAX_NR || - ptl_nal_table[interface] == NULL) { - GOTO(out, rc = PTL_IFACE_INVALID); - } - - nal = ptl_nal_table[interface]; - nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; - nal->nal_handle.cookie = 0; - - CDEBUG(D_OTHER, "Starting up NAL (%x) refs %d\n", interface, nal->nal_refct); - rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits); - - if (rc != PTL_OK) { - CERROR("Error %d starting up NAL %x, refs %d\n", rc, - interface, nal->nal_refct); - GOTO(out, rc); - } - - if (nal->nal_refct != 0) { - /* Caller gets to know if this was the first ref or not */ - rc = PTL_IFACE_DUP; - } - - nal->nal_refct++; - *handle = nal->nal_handle; - - out: - ptl_mutex_exit (); - - return rc; -} - -int PtlNIFini(ptl_handle_ni_t ni) -{ - nal_t *nal; - int idx; - - if (!ptl_init) - return PTL_NO_INIT; - - ptl_mutex_enter (); - - nal = ptl_hndl2nal (&ni); 
- if (nal == NULL) { - ptl_mutex_exit (); - return PTL_HANDLE_INVALID; - } - - idx = ni.nal_idx & NI_HANDLE_MASK; - - LASSERT(nal->nal_refct > 0); - - nal->nal_refct--; - - /* nal_refct == 0 tells nal->shutdown to really shut down */ - nal->nal_ni_fini(nal); - - ptl_mutex_exit (); - return PTL_OK; -} diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c deleted file mode 100644 index 37f6c0b..0000000 --- a/lustre/portals/portals/api-wrap.c +++ /dev/null @@ -1,366 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-wrap.c - * User-level wrappers that dispatch across the protection boundaries - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -# define DEBUG_SUBSYSTEM S_PORTALS -#include - -void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) -{ - snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); -} - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out) -{ - if (!ptl_init) - return PTL_NO_INIT; - - if (ptl_hndl2nal(&handle_in) == NULL) - return PTL_HANDLE_INVALID; - - *ni_out = handle_in; - return PTL_OK; -} - -int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_handle); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_get_id(nal, id); -} - -int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_handle); - if (nal == NULL) - return PTL_NI_INVALID; - - /* We don't support different uids yet */ - *uid = 0; - return PTL_OK; -} - -int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_fail_nid(nal, nid, threshold); -} - -int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t *status_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_ni_status(nal, register_in, status_out); -} - -int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, - unsigned long *distance_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_ni_dist(nal, &process_in, distance_out); -} - -int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, - ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, - ptl_match_bits_t 
ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_me_attach(nal, index_in, match_id_in, - match_bits_in, ignore_bits_in, - unlink_in, pos_in, handle_out); -} - -int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, - ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, - ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, - ptl_handle_me_t * handle_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&current_in); - if (nal == NULL) - return PTL_ME_INVALID; - - return nal->nal_me_insert(nal, &current_in, match_id_in, - match_bits_in, ignore_bits_in, - unlink_in, position_in, handle_out); -} - -int PtlMEUnlink(ptl_handle_me_t current_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&current_in); - if (nal == NULL) - return PTL_ME_INVALID; - - return nal->nal_me_unlink(nal, &current_in); -} - -int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&me_in); - if (nal == NULL) - return PTL_ME_INVALID; - - if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) && - ptl_hndl2nal(&md_in.eq_handle) != nal) - return PTL_MD_ILLEGAL; - - return (nal->nal_md_attach)(nal, &me_in, &md_in, - unlink_in, handle_out); -} - -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_in); - if (nal == NULL) - return PTL_NI_INVALID; - - if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) && - ptl_hndl2nal(&md_in.eq_handle) != nal) - return PTL_MD_ILLEGAL; - - return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out); -} - -int
PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, - ptl_md_t *new_inout, ptl_handle_eq_t testq_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) && - ptl_hndl2nal(&testq_in) != nal) - return PTL_EQ_INVALID; - - return (nal->nal_md_update)(nal, &md_in, - old_inout, new_inout, &testq_in); -} - -int PtlMDUnlink(ptl_handle_md_t md_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - return (nal->nal_md_unlink)(nal, &md_in); -} - -int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface); - if (nal == NULL) - return PTL_NI_INVALID; - - return (nal->nal_eq_alloc)(nal, count, callback, handle_out); -} - -int PtlEQFree(ptl_handle_eq_t eventq) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&eventq); - if (nal == NULL) - return PTL_EQ_INVALID; - - return (nal->nal_eq_free)(nal, &eventq); -} - -int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev) -{ - int which; - - return (PtlEQPoll (&eventq, 1, 0, ev, &which)); -} - -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) -{ - int which; - - return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, - event_out, &which)); -} - -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out) -{ - int i; - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - if (neq_in < 1) - return PTL_EQ_INVALID; - - nal = ptl_hndl2nal(&eventqs_in[0]); - if (nal == NULL) - return PTL_EQ_INVALID; - - for (i = 1; i < neq_in; i++) - if (ptl_hndl2nal(&eventqs_in[i]) != nal) - return PTL_EQ_INVALID; - - return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout, - event_out, 
which_out); -} - - -int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, - ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in); -} - -int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, - ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, - ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - return (nal->nal_put)(nal, &md_in, ack_req_in, - &target_in, portal_in, ac_in, - match_bits_in, offset_in, hdr_data_in); -} - -int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t ac_in, - ptl_match_bits_t match_bits_in, ptl_size_t offset_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - return (nal->nal_get)(nal, &md_in, - &target_in, portal_in, ac_in, - match_bits_in, offset_in); -} - diff --git a/lustre/portals/portals/autoMakefile.am b/lustre/portals/portals/autoMakefile.am deleted file mode 100644 index 285f8fe..0000000 --- a/lustre/portals/portals/autoMakefile.am +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2002 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -my_sources = api-errno.c api-ni.c api-wrap.c \ - lib-init.c lib-me.c lib-msg.c lib-eq.c \ - lib-md.c lib-move.c lib-ni.c lib-pid.c - -if !CRAY_PORTALS - -if LIBLUSTRE -noinst_LIBRARIES= libportals.a -libportals_a_SOURCES= $(my_sources) -libportals_a_CPPFLAGS = $(LLCPPFLAGS) -libportals_a_CFLAGS = $(LLCFLAGS) -endif - -if MODULES -modulenet_DATA = portals$(KMODEXT) -endif # MODULES - -endif # CRAY_PORTALS - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(portals-objs:%.o=%.c) diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c deleted file mode 100644 index 8ea6fdd..0000000 --- a/lustre/portals/portals/lib-eq.c +++ /dev/null @@ -1,265 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-eq.c - * Library level Event queue management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int -lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle) -{ - lib_nal_t *nal = apinal->nal_data; - lib_eq_t *eq; - unsigned long flags; - int rc; - - /* We need count to be a power of 2 so that when eq_{enq,deq}_seq - * overflow, they don't skip entries, so the queue has the same - * apparant capacity at all times */ - - if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ - do { /* knock off all but the top bit... */ - count &= ~LOWEST_BIT_SET (count); - } while (count != LOWEST_BIT_SET(count)); - - count <<= 1; /* ...and round up */ - } - - if (count == 0) /* catch bad parameter / overflow on roundup */ - return (PTL_VAL_FAILED); - - eq = lib_eq_alloc (nal); - if (eq == NULL) - return (PTL_NO_SPACE); - - PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t)); - if (eq->eq_events == NULL) { - LIB_LOCK(nal, flags); - lib_eq_free (nal, eq); - LIB_UNLOCK(nal, flags); - } - - if (nal->libnal_map != NULL) { - struct iovec iov = { - .iov_base = eq->eq_events, - .iov_len = count * sizeof(ptl_event_t)}; - - rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey); - if (rc != PTL_OK) { - LIB_LOCK(nal, flags); - lib_eq_free (nal, eq); - LIB_UNLOCK(nal, flags); - return (rc); - } - } - - /* NB this resets all event sequence numbers to 0, to be earlier - * than eq_deq_seq */ - memset(eq->eq_events, 0, count * sizeof(ptl_event_t)); - - eq->eq_deq_seq = 1; - eq->eq_enq_seq = 1; - eq->eq_size = count; - eq->eq_refcount = 0; - eq->eq_callback = callback; - - LIB_LOCK(nal, flags); - - lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); - list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs); - - LIB_UNLOCK(nal, flags); - - ptl_eq2handle(handle, nal, eq); - return (PTL_OK); -} - -int -lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh) -{ - lib_nal_t *nal = apinal->nal_data; - lib_eq_t *eq; - int size; - ptl_event_t *events; - void *addrkey; - 
unsigned long flags; - - LIB_LOCK(nal, flags); - - eq = ptl_handle2eq(eqh, nal); - if (eq == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_EQ_INVALID); - } - - if (eq->eq_refcount != 0) { - LIB_UNLOCK(nal, flags); - return (PTL_EQ_IN_USE); - } - - /* stash for free after lock dropped */ - events = eq->eq_events; - size = eq->eq_size; - addrkey = eq->eq_addrkey; - - lib_invalidate_handle (nal, &eq->eq_lh); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - - LIB_UNLOCK(nal, flags); - - if (nal->libnal_unmap != NULL) { - struct iovec iov = { - .iov_base = events, - .iov_len = size * sizeof(ptl_event_t)}; - - nal->libnal_unmap(nal, 1, &iov, &addrkey); - } - - PORTAL_FREE(events, size * sizeof (ptl_event_t)); - - return (PTL_OK); -} - -int -lib_get_event (lib_eq_t *eq, ptl_event_t *ev) -{ - int new_index = eq->eq_deq_seq & (eq->eq_size - 1); - ptl_event_t *new_event = &eq->eq_events[new_index]; - int rc; - ENTRY; - - CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", - new_event, eq->eq_deq_seq, eq->eq_size); - - if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { - RETURN(PTL_EQ_EMPTY); - } - - /* We've got a new event... */ - *ev = *new_event; - - /* ...but did it overwrite an event we've not seen yet? 
*/ - if (eq->eq_deq_seq == new_event->sequence) { - rc = PTL_OK; - } else { - CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n", - eq->eq_deq_seq, new_event->sequence); - rc = PTL_EQ_DROPPED; - } - - eq->eq_deq_seq = new_event->sequence + 1; - RETURN(rc); -} - - -int -lib_api_eq_poll (nal_t *apinal, - ptl_handle_eq_t *eventqs, int neq, int timeout_ms, - ptl_event_t *event, int *which) -{ - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - unsigned long flags; - int i; - int rc; -#ifdef __KERNEL__ - wait_queue_t wq; - unsigned long now; -#else - struct timeval then; - struct timeval now; - struct timespec ts; -#endif - ENTRY; - - LIB_LOCK(nal, flags); - - for (;;) { - for (i = 0; i < neq; i++) { - lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal); - - rc = lib_get_event (eq, event); - if (rc != PTL_EQ_EMPTY) { - LIB_UNLOCK(nal, flags); - *which = i; - RETURN(rc); - } - } - - if (timeout_ms == 0) { - LIB_UNLOCK (nal, flags); - RETURN (PTL_EQ_EMPTY); - } - - /* Some architectures force us to do spin locking/unlocking - * in the same stack frame, means we can abstract the - * locking here */ -#ifdef __KERNEL__ - init_waitqueue_entry(&wq, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ni->ni_waitq, &wq); - - LIB_UNLOCK(nal, flags); - - if (timeout_ms < 0) { - schedule (); - } else { - now = jiffies; - schedule_timeout((timeout_ms * HZ)/1000); - timeout_ms -= ((jiffies - now) * 1000)/HZ; - if (timeout_ms < 0) - timeout_ms = 0; - } - - LIB_LOCK(nal, flags); -#else - if (timeout_ms < 0) { - pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex); - } else { - gettimeofday(&then, NULL); - - ts.tv_sec = then.tv_sec + timeout_ms/1000; - ts.tv_nsec = then.tv_usec * 1000 + - (timeout_ms%1000) * 1000000; - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; - } - - pthread_cond_timedwait(&ni->ni_cond, - &ni->ni_mutex, &ts); - - gettimeofday(&now, NULL); - timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + - (now.tv_usec - 
then.tv_usec) / 1000; - - if (timeout_ms < 0) - timeout_ms = 0; - } -#endif - } -} diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c deleted file mode 100644 index 9d97bc1..0000000 --- a/lustre/portals/portals/lib-init.c +++ /dev/null @@ -1,434 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-init.c - * Start up the internal library and clear all structures - * Called by the NAL when it initializes. Safe to call multiple times. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -# define DEBUG_SUBSYSTEM S_PORTALS -#include - -#ifdef __KERNEL__ -# include /* for memset() */ -# include -# ifdef KERNEL_ADDR_CACHE -# include -# endif -#else -# include -# include -#endif - -#ifndef PTL_USE_LIB_FREELIST - -int -kportal_descriptor_setup (lib_nal_t *nal, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - /* Ignore requested limits! 
*/ - actual_limits->max_mes = INT_MAX; - actual_limits->max_mds = INT_MAX; - actual_limits->max_eqs = INT_MAX; - - return PTL_OK; -} - -void -kportal_descriptor_cleanup (lib_nal_t *nal) -{ -} -#else - -int -lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size) -{ - char *space; - - LASSERT (n > 0); - - size += offsetof (lib_freeobj_t, fo_contents); - - PORTAL_ALLOC(space, n * size); - if (space == NULL) - return (PTL_NO_SPACE); - - INIT_LIST_HEAD (&fl->fl_list); - fl->fl_objs = space; - fl->fl_nobjs = n; - fl->fl_objsize = size; - - do - { - memset (space, 0, size); - list_add ((struct list_head *)space, &fl->fl_list); - space += size; - } while (--n != 0); - - return (PTL_OK); -} - -void -lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl) -{ - struct list_head *el; - int count; - - if (fl->fl_nobjs == 0) - return; - - count = 0; - for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) - count++; - - LASSERT (count == fl->fl_nobjs); - - PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); - memset (fl, 0, sizeof (fl)); -} - -int -kportal_descriptor_setup (lib_nal_t *nal, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - /* NB on failure caller must still call kportal_descriptor_cleanup */ - /* ****** */ - lib_ni_t *ni = &nal->libnal_ni; - int rc; - - memset (&ni->ni_free_mes, 0, sizeof (ni->ni_free_mes)); - memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs)); - memset (&ni->ni_free_mds, 0, sizeof (ni->ni_free_mds)); - memset (&ni->ni_free_eqs, 0, sizeof (ni->ni_free_eqs)); - - /* Ignore requested limits! */ - actual_limits->max_mes = MAX_MES; - actual_limits->max_mds = MAX_MDS; - actual_limits->max_eqs = MAX_EQS; - /* Hahahah what a load of bollocks. 
There's nowhere to - * specify the max # messages in-flight */ - - rc = lib_freelist_init (nal, &ni->ni_free_mes, - MAX_MES, sizeof (lib_me_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_msgs, - MAX_MSGS, sizeof (lib_msg_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_mds, - MAX_MDS, sizeof (lib_md_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_eqs, - MAX_EQS, sizeof (lib_eq_t)); - return (rc); -} - -void -kportal_descriptor_cleanup (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - - lib_freelist_fini (nal, &ni->ni_free_mes); - lib_freelist_fini (nal, &ni->ni_free_msgs); - lib_freelist_fini (nal, &ni->ni_free_mds); - lib_freelist_fini (nal, &ni->ni_free_eqs); -} - -#endif - -__u64 -lib_create_interface_cookie (lib_nal_t *nal) -{ - /* NB the interface cookie in wire handles guards against delayed - * replies and ACKs appearing valid in a new instance of the same - * interface. Initialisation time, even if it's only implemented - * to millisecond resolution is probably easily good enough. 
*/ - struct timeval tv; - __u64 cookie; -#ifndef __KERNEL__ - int rc = gettimeofday (&tv, NULL); - LASSERT (rc == 0); -#else - do_gettimeofday(&tv); -#endif - cookie = tv.tv_sec; - cookie *= 1000000; - cookie += tv.tv_usec; - return (cookie); -} - -int -lib_setup_handle_hash (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - int i; - - /* Arbitrary choice of hash table size */ -#ifdef __KERNEL__ - ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); -#else - ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; -#endif - PORTAL_ALLOC(ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); - if (ni->ni_lh_hash_table == NULL) - return (PTL_NO_SPACE); - - for (i = 0; i < ni->ni_lh_hash_size; i++) - INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); - - ni->ni_next_object_cookie = PTL_COOKIE_TYPES; - - return (PTL_OK); -} - -void -lib_cleanup_handle_hash (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - - if (ni->ni_lh_hash_table == NULL) - return; - - PORTAL_FREE(ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); -} - -lib_handle_t * -lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) -{ - /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->libnal_ni; - struct list_head *list; - struct list_head *el; - unsigned int hash; - - if ((cookie & (PTL_COOKIE_TYPES - 1)) != type) - return (NULL); - - hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; - list = &ni->ni_lh_hash_table[hash]; - - list_for_each (el, list) { - lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); - - if (lh->lh_cookie == cookie) - return (lh); - } - - return (NULL); -} - -void -lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) -{ - /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->libnal_ni; - unsigned int hash; - - LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); - lh->lh_cookie = ni->ni_next_object_cookie | type; - ni->ni_next_object_cookie += PTL_COOKIE_TYPES; - - hash = ((unsigned 
int)lh->lh_cookie) % ni->ni_lh_hash_size; - list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); -} - -void -lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh) -{ - list_del (&lh->lh_hash_chain); -} - -int -lib_init(lib_nal_t *libnal, nal_t *apinal, - ptl_process_id_t process_id, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - int rc = PTL_OK; - lib_ni_t *ni = &libnal->libnal_ni; - int ptl_size; - int i; - ENTRY; - - /* NB serialised in PtlNIInit() */ - - lib_assert_wire_constants (); - - /* Setup the API nal with the lib API handling functions */ - apinal->nal_get_id = lib_api_get_id; - apinal->nal_ni_status = lib_api_ni_status; - apinal->nal_ni_dist = lib_api_ni_dist; - apinal->nal_fail_nid = lib_api_fail_nid; - apinal->nal_me_attach = lib_api_me_attach; - apinal->nal_me_insert = lib_api_me_insert; - apinal->nal_me_unlink = lib_api_me_unlink; - apinal->nal_md_attach = lib_api_md_attach; - apinal->nal_md_bind = lib_api_md_bind; - apinal->nal_md_unlink = lib_api_md_unlink; - apinal->nal_md_update = lib_api_md_update; - apinal->nal_eq_alloc = lib_api_eq_alloc; - apinal->nal_eq_free = lib_api_eq_free; - apinal->nal_eq_poll = lib_api_eq_poll; - apinal->nal_put = lib_api_put; - apinal->nal_get = lib_api_get; - - apinal->nal_data = libnal; - ni->ni_api = apinal; - - rc = kportal_descriptor_setup (libnal, requested_limits, - &ni->ni_actual_limits); - if (rc != PTL_OK) - goto out; - - memset(&ni->ni_counters, 0, sizeof(lib_counters_t)); - - INIT_LIST_HEAD (&ni->ni_active_msgs); - INIT_LIST_HEAD (&ni->ni_active_mds); - INIT_LIST_HEAD (&ni->ni_active_eqs); - INIT_LIST_HEAD (&ni->ni_test_peers); - -#ifdef __KERNEL__ - spin_lock_init (&ni->ni_lock); - init_waitqueue_head (&ni->ni_waitq); -#else - pthread_mutex_init(&ni->ni_mutex, NULL); - pthread_cond_init(&ni->ni_cond, NULL); -#endif - - ni->ni_interface_cookie = lib_create_interface_cookie (libnal); - ni->ni_next_object_cookie = 0; - rc = lib_setup_handle_hash (libnal); - if (rc != 
PTL_OK) - goto out; - - ni->ni_pid = process_id; - - if (requested_limits != NULL) - ptl_size = requested_limits->max_pt_index + 1; - else - ptl_size = 64; - - ni->ni_portals.size = ptl_size; - PORTAL_ALLOC(ni->ni_portals.tbl, - ptl_size * sizeof(struct list_head)); - if (ni->ni_portals.tbl == NULL) { - rc = PTL_NO_SPACE; - goto out; - } - - for (i = 0; i < ptl_size; i++) - INIT_LIST_HEAD(&(ni->ni_portals.tbl[i])); - - /* max_{mes,mds,eqs} set in kportal_descriptor_setup */ - - /* We don't have an access control table! */ - ni->ni_actual_limits.max_ac_index = -1; - - ni->ni_actual_limits.max_pt_index = ptl_size - 1; - ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; - ni->ni_actual_limits.max_me_list = INT_MAX; - - /* We don't support PtlGetPut! */ - ni->ni_actual_limits.max_getput_md = 0; - - if (actual_limits != NULL) - *actual_limits = ni->ni_actual_limits; - - out: - if (rc != PTL_OK) { - lib_cleanup_handle_hash (libnal); - kportal_descriptor_cleanup (libnal); - } - - RETURN (rc); -} - -int -lib_fini(lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - int idx; - - /* NB no state_lock() since this is the last reference. 
The NAL - * should have shut down already, so it should be safe to unlink - * and free all descriptors, even those that appear committed to a - * network op (eg MD with non-zero pending count) - */ - - for (idx = 0; idx < ni->ni_portals.size; idx++) - while (!list_empty (&ni->ni_portals.tbl[idx])) { - lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next, - lib_me_t, me_list); - - CERROR ("Active me %p on exit\n", me); - list_del (&me->me_list); - lib_me_free (nal, me); - } - - while (!list_empty (&ni->ni_active_mds)) { - lib_md_t *md = list_entry (ni->ni_active_mds.next, - lib_md_t, md_list); - - CERROR ("Active md %p on exit\n", md); - list_del (&md->md_list); - lib_md_free (nal, md); - } - - while (!list_empty (&ni->ni_active_eqs)) { - lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, - lib_eq_t, eq_list); - - CERROR ("Active eq %p on exit\n", eq); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - } - - while (!list_empty (&ni->ni_active_msgs)) { - lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, - lib_msg_t, msg_list); - - CERROR ("Active msg %p on exit\n", msg); - list_del (&msg->msg_list); - lib_msg_free (nal, msg); - } - - PORTAL_FREE(ni->ni_portals.tbl, - ni->ni_portals.size * sizeof(struct list_head)); - - lib_cleanup_handle_hash (nal); - kportal_descriptor_cleanup (nal); - -#ifndef __KERNEL__ - pthread_mutex_destroy(&ni->ni_mutex); - pthread_cond_destroy(&ni->ni_cond); -#endif - - return (PTL_OK); -} diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c deleted file mode 100644 index 6deadb8..0000000 --- a/lustre/portals/portals/lib-md.c +++ /dev/null @@ -1,426 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-md.c - * Memory Descriptor management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
- * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef __KERNEL__ -# include -#else -# define DEBUG_SUBSYSTEM S_PORTALS -# include -#endif - -#include - -/* must be called with state lock held */ -void -lib_md_unlink(lib_nal_t *nal, lib_md_t *md) -{ - if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) { - /* first unlink attempt... 
*/ - lib_me_t *me = md->me; - - md->md_flags |= PTL_MD_FLAG_ZOMBIE; - - /* Disassociate from ME (if any), and unlink it if it was created - * with PTL_UNLINK */ - if (me != NULL) { - me->md = NULL; - if (me->unlink == PTL_UNLINK) - lib_me_unlink(nal, me); - } - - /* emsure all future handle lookups fail */ - lib_invalidate_handle(nal, &md->md_lh); - } - - if (md->pending != 0) { - CDEBUG(D_NET, "Queueing unlink of md %p\n", md); - return; - } - - CDEBUG(D_NET, "Unlinking md %p\n", md); - - if ((md->options & PTL_MD_KIOV) != 0) { - if (nal->libnal_unmap_pages != NULL) - nal->libnal_unmap_pages (nal, - md->md_niov, - md->md_iov.kiov, - &md->md_addrkey); - } else if (nal->libnal_unmap != NULL) { - nal->libnal_unmap (nal, - md->md_niov, md->md_iov.iov, - &md->md_addrkey); - } - - if (md->eq != NULL) { - md->eq->eq_refcount--; - LASSERT (md->eq->eq_refcount >= 0); - } - - list_del (&md->md_list); - lib_md_free(nal, md); -} - -/* must be called with state lock held */ -static int -lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) -{ - lib_eq_t *eq = NULL; - int rc; - int i; - int niov; - int total_length = 0; - - /* NB we are passed an allocated, but uninitialised/active md. - * if we return success, caller may lib_md_unlink() it. - * otherwise caller may only lib_md_free() it. - */ - - if (!PtlHandleIsEqual (umd->eq_handle, PTL_EQ_NONE)) { - eq = ptl_handle2eq(&umd->eq_handle, nal); - if (eq == NULL) - return PTL_EQ_INVALID; - } - - /* This implementation doesn't know how to create START events or - * disable END events. Best to LASSERT our caller is compliant so - * we find out quickly... 
*/ - LASSERT (eq == NULL || - ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 && - (umd->options & PTL_MD_EVENT_END_DISABLE) == 0)); - - lmd->me = NULL; - lmd->start = umd->start; - lmd->offset = 0; - lmd->max_size = umd->max_size; - lmd->options = umd->options; - lmd->user_ptr = umd->user_ptr; - lmd->eq = eq; - lmd->threshold = umd->threshold; - lmd->pending = 0; - lmd->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0; - - if ((umd->options & PTL_MD_IOVEC) != 0) { - - if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */ - return PTL_MD_ILLEGAL; - - lmd->md_niov = niov = umd->length; - memcpy(lmd->md_iov.iov, umd->start, - niov * sizeof (lmd->md_iov.iov[0])); - - for (i = 0; i < niov; i++) { - /* We take the base address on trust */ - if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ - return PTL_MD_ILLEGAL; - - total_length += lmd->md_iov.iov[i].iov_len; - } - - lmd->length = total_length; - - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ - (umd->max_size < 0 || - umd->max_size > total_length)) // illegal max_size - return PTL_MD_ILLEGAL; - - if (nal->libnal_map != NULL) { - rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } - } else if ((umd->options & PTL_MD_KIOV) != 0) { -#ifndef __KERNEL__ - return PTL_MD_ILLEGAL; -#else - /* Trap attempt to use paged I/O if unsupported early. 
*/ - if (nal->libnal_send_pages == NULL || - nal->libnal_recv_pages == NULL) - return PTL_MD_INVALID; - - lmd->md_niov = niov = umd->length; - memcpy(lmd->md_iov.kiov, umd->start, - niov * sizeof (lmd->md_iov.kiov[0])); - - for (i = 0; i < niov; i++) { - /* We take the page pointer on trust */ - if (lmd->md_iov.kiov[i].kiov_offset + - lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE ) - return PTL_VAL_FAILED; /* invalid length */ - - total_length += lmd->md_iov.kiov[i].kiov_len; - } - - lmd->length = total_length; - - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ - (umd->max_size < 0 || - umd->max_size > total_length)) // illegal max_size - return PTL_MD_ILLEGAL; - - if (nal->libnal_map_pages != NULL) { - rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } -#endif - } else { /* contiguous */ - lmd->length = umd->length; - lmd->md_niov = niov = 1; - lmd->md_iov.iov[0].iov_base = umd->start; - lmd->md_iov.iov[0].iov_len = umd->length; - - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ - (umd->max_size < 0 || - umd->max_size > umd->length)) // illegal max_size - return PTL_MD_ILLEGAL; - - if (nal->libnal_map != NULL) { - rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } - } - - if (eq != NULL) - eq->eq_refcount++; - - /* It's good; let handle2md succeed and add to active mds */ - lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD); - list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds); - - return PTL_OK; -} - -/* must be called with state lock held */ -void -lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd) -{ - /* NB this doesn't copy out all the iov entries so when a - * discontiguous MD is copied out, the target gets to know the - * original iov pointer (in start) and the number of entries it had - * and that's all. 
- */ - umd->start = lmd->start; - umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? - lmd->length : lmd->md_niov; - umd->threshold = lmd->threshold; - umd->max_size = lmd->max_size; - umd->options = lmd->options; - umd->user_ptr = lmd->user_ptr; - ptl_eq2handle(&umd->eq_handle, nal, lmd->eq); -} - -int -lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle) -{ - lib_nal_t *nal = apinal->nal_data; - lib_me_t *me; - lib_md_t *md; - unsigned long flags; - int rc; - - if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - umd->length > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_INVALID; - - md = lib_md_alloc(nal, umd); - if (md == NULL) - return PTL_NO_SPACE; - - LIB_LOCK(nal, flags); - - me = ptl_handle2me(meh, nal); - if (me == NULL) { - rc = PTL_ME_INVALID; - } else if (me->md != NULL) { - rc = PTL_ME_IN_USE; - } else { - rc = lib_md_build(nal, md, umd, unlink); - if (rc == PTL_OK) { - me->md = md; - md->me = me; - - ptl_md2handle(handle, nal, md); - - LIB_UNLOCK(nal, flags); - return (PTL_OK); - } - } - - lib_md_free (nal, md); - - LIB_UNLOCK(nal, flags); - return (rc); -} - -int -lib_api_md_bind(nal_t *apinal, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle) -{ - lib_nal_t *nal = apinal->nal_data; - lib_md_t *md; - unsigned long flags; - int rc; - - if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - umd->length > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_INVALID; - - md = lib_md_alloc(nal, umd); - if (md == NULL) - return PTL_NO_SPACE; - - LIB_LOCK(nal, flags); - - rc = lib_md_build(nal, md, umd, unlink); - - if (rc == PTL_OK) { - ptl_md2handle(handle, nal, md); - - LIB_UNLOCK(nal, flags); - return (PTL_OK); - } - - lib_md_free (nal, md); - - LIB_UNLOCK(nal, flags); - return (rc); -} - -int -lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh) -{ - lib_nal_t *nal = apinal->nal_data; - ptl_event_t ev; - lib_md_t *md; - 
unsigned long flags; - - LIB_LOCK(nal, flags); - - md = ptl_handle2md(mdh, nal); - if (md == NULL) { - LIB_UNLOCK(nal, flags); - return PTL_MD_INVALID; - } - - /* If the MD is busy, lib_md_unlink just marks it for deletion, and - * when the NAL is done, the completion event flags that the MD was - * unlinked. Otherwise, we enqueue an event now... */ - - if (md->eq != NULL && - md->pending == 0) { - memset(&ev, 0, sizeof(ev)); - - ev.type = PTL_EVENT_UNLINK; - ev.ni_fail_type = PTL_OK; - ev.unlinked = 1; - lib_md_deconstruct(nal, md, &ev.md); - ptl_md2handle(&ev.md_handle, nal, md); - - lib_enq_event_locked(nal, NULL, md->eq, &ev); - } - - lib_md_unlink(nal, md); - - LIB_UNLOCK(nal, flags); - return PTL_OK; -} - -int -lib_api_md_update (nal_t *apinal, - ptl_handle_md_t *mdh, - ptl_md_t *oldumd, ptl_md_t *newumd, - ptl_handle_eq_t *testqh) -{ - lib_nal_t *nal = apinal->nal_data; - lib_md_t *md; - lib_eq_t *test_eq = NULL; - unsigned long flags; - int rc; - - LIB_LOCK(nal, flags); - - md = ptl_handle2md(mdh, nal); - if (md == NULL) { - rc = PTL_MD_INVALID; - goto out; - } - - if (oldumd != NULL) - lib_md_deconstruct(nal, md, oldumd); - - if (newumd == NULL) { - rc = PTL_OK; - goto out; - } - - /* XXX fttb, the new MD must be the same "shape" wrt fragmentation, - * since we simply overwrite the old lib-md */ - if ((((newumd->options ^ md->options) & - (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) || - ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && - newumd->length != md->md_niov)) { - rc = PTL_IOV_INVALID; - goto out; - } - - if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) { - test_eq = ptl_handle2eq(testqh, nal); - if (test_eq == NULL) { - rc = PTL_EQ_INVALID; - goto out; - } - } - - if (md->pending != 0) { - rc = PTL_MD_NO_UPDATE; - goto out; - } - - if (test_eq == NULL || - test_eq->eq_deq_seq == test_eq->eq_enq_seq) { - lib_me_t *me = md->me; - int unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ? 
- PTL_UNLINK : PTL_RETAIN; - - // #warning this does not track eq refcounts properly - rc = lib_md_build(nal, md, newumd, unlink); - - md->me = me; - } else { - rc = PTL_MD_NO_UPDATE; - } - - out: - LIB_UNLOCK(nal, flags); - - return rc; -} diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c deleted file mode 100644 index 9665b4f..0000000 --- a/lustre/portals/portals/lib-me.c +++ /dev/null @@ -1,185 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-me.c - * Match Entry management routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef __KERNEL__ -# include -#else -# define DEBUG_SUBSYSTEM S_PORTALS -# include -#endif - -#include - -int -lib_api_me_attach(nal_t *apinal, - ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle) -{ - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_ptl_t *tbl = &ni->ni_portals; - lib_me_t *me; - unsigned long flags; - - if (portal >= tbl->size) - return PTL_PT_INDEX_INVALID; - - /* Should check for valid matchid, but not yet */ - - me = lib_me_alloc (nal); - if (me == NULL) - return PTL_NO_SPACE; - - LIB_LOCK(nal, flags); - - me->match_id = match_id; - me->match_bits = match_bits; - me->ignore_bits = ignore_bits; - me->unlink = unlink; - me->md = NULL; - - lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); - - if (pos == PTL_INS_AFTER) - list_add_tail(&me->me_list, &(tbl->tbl[portal])); - else - list_add(&me->me_list, &(tbl->tbl[portal])); - - ptl_me2handle(handle, nal, me); - - LIB_UNLOCK(nal, flags); - - return PTL_OK; -} - -int -lib_api_me_insert(nal_t *apinal, - ptl_handle_me_t *current_meh, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle) -{ - lib_nal_t *nal = apinal->nal_data; - lib_me_t *current_me; - lib_me_t *new_me; - unsigned long flags; - - new_me = lib_me_alloc (nal); - if (new_me == NULL) - return PTL_NO_SPACE; - - /* Should check for valid matchid, but not yet */ - - LIB_LOCK(nal, flags); - - current_me = ptl_handle2me(current_meh, nal); - if (current_me == NULL) { - lib_me_free (nal, new_me); - - LIB_UNLOCK(nal, flags); - return PTL_ME_INVALID; - } - - new_me->match_id = match_id; - new_me->match_bits = match_bits; - new_me->ignore_bits = ignore_bits; - new_me->unlink = unlink; - new_me->md = NULL; - - lib_initialise_handle (nal, &new_me->me_lh, 
PTL_COOKIE_TYPE_ME); - - if (pos == PTL_INS_AFTER) - list_add_tail(&new_me->me_list, &current_me->me_list); - else - list_add(&new_me->me_list, &current_me->me_list); - - ptl_me2handle(handle, nal, new_me); - - LIB_UNLOCK(nal, flags); - - return PTL_OK; -} - -int -lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh) -{ - lib_nal_t *nal = apinal->nal_data; - unsigned long flags; - lib_me_t *me; - int rc; - - LIB_LOCK(nal, flags); - - me = ptl_handle2me(meh, nal); - if (me == NULL) { - rc = PTL_ME_INVALID; - } else { - lib_me_unlink(nal, me); - rc = PTL_OK; - } - - LIB_UNLOCK(nal, flags); - - return (rc); -} - -/* call with state_lock please */ -void -lib_me_unlink(lib_nal_t *nal, lib_me_t *me) -{ - list_del (&me->me_list); - - if (me->md) { - me->md->me = NULL; - lib_md_unlink(nal, me->md); - } - - lib_invalidate_handle (nal, &me->me_lh); - lib_me_free(nal, me); -} - -#if 0 -static void -lib_me_dump(lib_nal_t *nal, lib_me_t * me) -{ - CWARN("Match Entry %p ("LPX64")\n", me, - me->me_lh.lh_cookie); - - CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", - me->match_bits, me->ignore_bits); - - CWARN("\tMD\t= %p\n", me->md); - CWARN("\tprev\t= %p\n", - list_entry(me->me_list.prev, lib_me_t, me_list)); - CWARN("\tnext\t= %p\n", - list_entry(me->me_list.next, lib_me_t, me_list)); -} -#endif diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c deleted file mode 100644 index d584f1c..0000000 --- a/lustre/portals/portals/lib-move.c +++ /dev/null @@ -1,1427 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-move.c - * Data movement routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
- * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef __KERNEL__ -# include -#else -# define DEBUG_SUBSYSTEM S_PORTALS -# include -#endif -#include -#include - -/* forward ref */ -static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg); - -static lib_md_t * -lib_match_md(lib_nal_t *nal, int index, int op_mask, - ptl_nid_t src_nid, ptl_pid_t src_pid, - ptl_size_t rlength, ptl_size_t roffset, - ptl_match_bits_t match_bits, lib_msg_t *msg, - ptl_size_t *mlength_out, ptl_size_t *offset_out) -{ - lib_ni_t *ni = &nal->libnal_ni; - struct list_head *match_list = &ni->ni_portals.tbl[index]; - struct list_head *tmp; - lib_me_t *me; - lib_md_t *md; - ptl_size_t mlength; - ptl_size_t offset; - ENTRY; - - CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " - "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); - - if (index < 0 || index >= ni->ni_portals.size) { - CERROR("Invalid portal %d not in [0-%d]\n", - index, ni->ni_portals.size); - goto failed; - } - - list_for_each (tmp, match_list) { - me = list_entry(tmp, lib_me_t, me_list); - md = me->md; - - /* ME attached but MD not attached yet */ - if (md == NULL) - continue; - - LASSERT (me == md->me); - - /* mismatched MD op */ - if ((md->options & op_mask) == 0) - 
continue; - - /* MD exhausted */ - if (lib_md_exhausted(md)) - continue; - - /* mismatched ME nid/pid? */ - if (me->match_id.nid != PTL_NID_ANY && - me->match_id.nid != src_nid) - continue; - - CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n", - me->match_id.pid, src_pid); - - if (me->match_id.pid != PTL_PID_ANY && - me->match_id.pid != src_pid) - continue; - - /* mismatched ME matchbits? */ - if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) - continue; - - /* Hurrah! This _is_ a match; check it out... */ - - if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) - offset = md->offset; - else - offset = roffset; - - if ((md->options & PTL_MD_MAX_SIZE) != 0) { - mlength = md->max_size; - LASSERT (md->offset + mlength <= md->length); - } else { - mlength = md->length - offset; - } - - if (rlength <= mlength) { /* fits in allowed space */ - mlength = rlength; - } else if ((md->options & PTL_MD_TRUNCATE) == 0) { - /* this packet _really_ is too big */ - CERROR("Matching packet %d too big: %d left, " - "%d allowed\n", rlength, md->length - offset, - mlength); - goto failed; - } - - /* Commit to this ME/MD */ - CDEBUG(D_NET, "Incoming %s index %x from "LPU64"/%u of " - "length %d/%d into md "LPX64" [%d] + %d\n", - (op_mask == PTL_MD_OP_PUT) ? "put" : "get", - index, src_nid, src_pid, mlength, rlength, - md->md_lh.lh_cookie, md->md_niov, offset); - - lib_commit_md(nal, md, msg); - md->offset = offset + mlength; - - /* NB Caller sets ev.type and ev.hdr_data */ - msg->ev.initiator.nid = src_nid; - msg->ev.initiator.pid = src_pid; - msg->ev.pt_index = index; - msg->ev.match_bits = match_bits; - msg->ev.rlength = rlength; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - *offset_out = offset; - *mlength_out = mlength; - - /* Auto-unlink NOW, so the ME gets unlinked if required. 
- * We bumped md->pending above so the MD just gets flagged - * for unlink when it is finalized. */ - if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) != 0 && - lib_md_exhausted(md)) - lib_md_unlink(nal, md); - - RETURN (md); - } - - failed: - CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 - " offset %d length %d: no match\n", - ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", - src_nid, src_pid, index, match_bits, roffset, rlength); - RETURN(NULL); -} - -int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) -{ - lib_nal_t *nal = apinal->nal_data; - lib_test_peer_t *tp; - unsigned long flags; - struct list_head *el; - struct list_head *next; - struct list_head cull; - - if (threshold != 0) { - /* Adding a new entry */ - PORTAL_ALLOC(tp, sizeof(*tp)); - if (tp == NULL) - return PTL_NO_SPACE; - - tp->tp_nid = nid; - tp->tp_threshold = threshold; - - LIB_LOCK(nal, flags); - list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers); - LIB_UNLOCK(nal, flags); - return PTL_OK; - } - - /* removing entries */ - INIT_LIST_HEAD (&cull); - - LIB_LOCK(nal, flags); - - list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { - tp = list_entry (el, lib_test_peer_t, tp_list); - - if (tp->tp_threshold == 0 || /* needs culling anyway */ - nid == PTL_NID_ANY || /* removing all entries */ - tp->tp_nid == nid) /* matched this one */ - { - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); - } - } - - LIB_UNLOCK(nal, flags); - - while (!list_empty (&cull)) { - tp = list_entry (cull.next, lib_test_peer_t, tp_list); - - list_del (&tp->tp_list); - PORTAL_FREE(tp, sizeof (*tp)); - } - return PTL_OK; -} - -static int -fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) -{ - lib_test_peer_t *tp; - struct list_head *el; - struct list_head *next; - unsigned long flags; - struct list_head cull; - int fail = 0; - - INIT_LIST_HEAD (&cull); - - LIB_LOCK (nal, flags); - - list_for_each_safe (el, next, 
&nal->libnal_ni.ni_test_peers) { - tp = list_entry (el, lib_test_peer_t, tp_list); - - if (tp->tp_threshold == 0) { - /* zombie entry */ - if (outgoing) { - /* only cull zombies on outgoing tests, - * since we may be at interrupt priority on - * incoming messages. */ - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); - } - continue; - } - - if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ - nid == tp->tp_nid) { /* fail this peer */ - fail = 1; - - if (tp->tp_threshold != PTL_MD_THRESH_INF) { - tp->tp_threshold--; - if (outgoing && - tp->tp_threshold == 0) { - /* see above */ - list_del (&tp->tp_list); - list_add (&tp->tp_list, &cull); - } - } - break; - } - } - - LIB_UNLOCK (nal, flags); - - while (!list_empty (&cull)) { - tp = list_entry (cull.next, lib_test_peer_t, tp_list); - list_del (&tp->tp_list); - - PORTAL_FREE(tp, sizeof (*tp)); - } - - return (fail); -} - -ptl_size_t -lib_iov_nob (int niov, struct iovec *iov) -{ - ptl_size_t nob = 0; - - while (niov-- > 0) - nob += (iov++)->iov_len; - - return (nob); -} - -void -lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, - ptl_size_t offset, ptl_size_t len) -{ - ptl_size_t nob; - - if (len == 0) - return; - - /* skip complete frags before 'offset' */ - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - LASSERT (niov > 0); - nob = MIN (iov->iov_len - offset, len); - memcpy (dest, iov->iov_base + offset, nob); - - len -= nob; - dest += nob; - niov--; - iov++; - offset = 0; - } while (len > 0); -} - -void -lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, - char *src, ptl_size_t len) -{ - ptl_size_t nob; - - if (len == 0) - return; - - /* skip complete frags before 'offset' */ - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - LASSERT (niov > 0); - nob = MIN (iov->iov_len - offset, len); - memcpy 
(iov->iov_base + offset, src, nob); - - len -= nob; - src += nob; - niov--; - iov++; - offset = 0; - } while (len > 0); -} - -int -lib_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - ptl_size_t offset, ptl_size_t len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - ptl_size_t frag_len; - int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT (src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT (src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT (src_niov > 0); - LASSERT (niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} - -#ifndef __KERNEL__ -ptl_size_t -lib_kiov_nob (int niov, ptl_kiov_t *kiov) -{ - LASSERT (0); - return (0); -} - -void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len) -{ - LASSERT (0); -} - -void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len) -{ - LASSERT (0); -} - -int -lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len) -{ - LASSERT (0); -} - -#else - -ptl_size_t -lib_kiov_nob (int niov, ptl_kiov_t *kiov) -{ - ptl_size_t nob = 0; - - while (niov-- > 0) - nob += (kiov++)->kiov_len; - - return (nob); -} - -void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len) -{ - ptl_size_t nob; - char *addr; - - if (len == 0) - return; - - LASSERT (!in_interrupt ()); - - LASSERT (niov > 0); - while (offset > kiov->kiov_len) { - offset -= 
kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - - do{ - LASSERT (niov > 0); - nob = MIN (kiov->kiov_len - offset, len); - - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; - memcpy (dest, addr, nob); - kunmap (kiov->kiov_page); - - len -= nob; - dest += nob; - niov--; - kiov++; - offset = 0; - } while (len > 0); -} - -void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len) -{ - ptl_size_t nob; - char *addr; - - if (len == 0) - return; - - LASSERT (!in_interrupt ()); - - LASSERT (niov > 0); - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - - do { - LASSERT (niov > 0); - nob = MIN (kiov->kiov_len - offset, len); - - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; - memcpy (addr, src, nob); - kunmap (kiov->kiov_page); - - len -= nob; - src += nob; - niov--; - kiov++; - offset = 0; - } while (len > 0); -} - -int -lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. 
- * NB not destructive to 'src' */ - ptl_size_t frag_len; - int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT (src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; - src_niov--; - src++; - LASSERT (src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT (src_niov > 0); - LASSERT (niov <= dst_niov); - - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; - - if (len <= frag_len) { - dst->kiov_len = len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); - return (niov); - } - - dst->kiov_len = frag_len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -#endif - -ptl_err_t -lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) -{ - if (mlen == 0) - return (nal->libnal_recv(nal, private, msg, - 0, NULL, - offset, mlen, rlen)); - - if ((md->options & PTL_MD_KIOV) == 0) - return (nal->libnal_recv(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen)); - - return (nal->libnal_recv_pages(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen)); -} - -ptl_err_t -lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len) -{ - if (len == 0) - return (nal->libnal_send(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len)); - - if ((md->options & PTL_MD_KIOV) == 0) - return (nal->libnal_send(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len)); - - return (nal->libnal_send_pages(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len)); -} - -static void -lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg) -{ - /* ALWAYS called 
holding the LIB_LOCK */ - lib_counters_t *counters = &nal->libnal_ni.ni_counters; - - /* Here, we commit the MD to a network OP by marking it busy and - * decrementing its threshold. Come what may, the network "owns" - * the MD until a call to lib_finalize() signals completion. */ - msg->md = md; - - md->pending++; - if (md->threshold != PTL_MD_THRESH_INF) { - LASSERT (md->threshold > 0); - md->threshold--; - } - - counters->msgs_alloc++; - if (counters->msgs_alloc > counters->msgs_max) - counters->msgs_max = counters->msgs_alloc; - - list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs); -} - -static void -lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr) -{ - unsigned long flags; - - /* CAVEAT EMPTOR: this only drops messages that we've not committed - * to receive (init_msg() not called) and therefore can't cause an - * event. */ - - LIB_LOCK(nal, flags); - nal->libnal_ni.ni_counters.drop_count++; - nal->libnal_ni.ni_counters.drop_length += hdr->payload_length; - LIB_UNLOCK(nal, flags); - - /* NULL msg => if NAL calls lib_finalize it will be a noop */ - (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); -} - -/* - * Incoming messages have a ptl_msg_t object associated with them - * by the library. This object encapsulates the state of the - * message and allows the NAL to do non-blocking receives or sends - * of long messages. 
- * - */ -static ptl_err_t -parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) -{ - lib_ni_t *ni = &nal->libnal_ni; - ptl_size_t mlength = 0; - ptl_size_t offset = 0; - ptl_err_t rc; - lib_md_t *md; - unsigned long flags; - - /* Convert put fields to host byte order */ - hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); - hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); - hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); - - LIB_LOCK(nal, flags); - - md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, - hdr->src_nid, hdr->src_pid, - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.match_bits, msg, - &mlength, &offset); - if (md == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - - msg->ev.type = PTL_EVENT_PUT_END; - msg->ev.hdr_data = hdr->msg.put.hdr_data; - - if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && - !(md->options & PTL_MD_ACK_DISABLE)) { - msg->ack_wmd = hdr->msg.put.ack_wmd; - } - - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += mlength; - - LIB_UNLOCK(nal, flags); - - rc = lib_recv(nal, private, msg, md, offset, mlength, - hdr->payload_length); - if (rc != PTL_OK) - CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); - - return (rc); -} - -static ptl_err_t -parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) -{ - lib_ni_t *ni = &nal->libnal_ni; - ptl_size_t mlength = 0; - ptl_size_t offset = 0; - lib_md_t *md; - ptl_hdr_t reply; - unsigned long flags; - int rc; - - /* Convert get fields to host byte order */ - hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); - hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); - hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); - hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); - - LIB_LOCK(nal, flags); - - md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, - hdr->src_nid, 
hdr->src_pid, - hdr->msg.get.sink_length, hdr->msg.get.src_offset, - hdr->msg.get.match_bits, msg, - &mlength, &offset); - if (md == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - - msg->ev.type = PTL_EVENT_GET_END; - msg->ev.hdr_data = 0; - - ni->ni_counters.send_count++; - ni->ni_counters.send_length += mlength; - - LIB_UNLOCK(nal, flags); - - memset (&reply, 0, sizeof (reply)); - reply.type = cpu_to_le32(PTL_MSG_REPLY); - reply.dest_nid = cpu_to_le64(hdr->src_nid); - reply.dest_pid = cpu_to_le32(hdr->src_pid); - reply.src_nid = cpu_to_le64(ni->ni_pid.nid); - reply.src_pid = cpu_to_le32(ni->ni_pid.pid); - reply.payload_length = cpu_to_le32(mlength); - - reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; - - /* NB call lib_send() _BEFORE_ lib_recv() completes the incoming - * message. Some NALs _require_ this to implement optimized GET */ - - rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, - hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != PTL_OK) - CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); - - /* Discard any junk after the hdr */ - (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); - - return (rc); -} - -static ptl_err_t -parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) -{ - lib_ni_t *ni = &nal->libnal_ni; - lib_md_t *md; - int rlength; - int length; - unsigned long flags; - ptl_err_t rc; - - LIB_LOCK(nal, flags); - - /* NB handles only looked up by creator (no flips) */ - md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); - if (md == NULL || md->threshold == 0) { - CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n", - ni->ni_pid.nid, hdr->src_nid, - md == NULL ? 
"invalid" : "inactive", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - - LASSERT (md->offset == 0); - - length = rlength = hdr->payload_length; - - if (length > md->length) { - if ((md->options & PTL_MD_TRUNCATE) == 0) { - CERROR (LPU64": Dropping REPLY from "LPU64 - " length %d for MD "LPX64" would overflow (%d)\n", - ni->ni_pid.nid, hdr->src_nid, length, - hdr->msg.reply.dst_wmd.wh_object_cookie, - md->length); - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - length = md->length; - } - - CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", - hdr->src_nid, length, rlength, - hdr->msg.reply.dst_wmd.wh_object_cookie); - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_REPLY_END; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.rlength = rlength; - msg->ev.mlength = length; - msg->ev.offset = 0; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += length; - - LIB_UNLOCK(nal, flags); - - rc = lib_recv(nal, private, msg, md, 0, length, rlength); - if (rc != PTL_OK) - CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); - - return (rc); -} - -static ptl_err_t -parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) -{ - lib_ni_t *ni = &nal->libnal_ni; - lib_md_t *md; - unsigned long flags; - - /* Convert ack fields to host byte order */ - hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); - hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); - - LIB_LOCK(nal, flags); - - /* NB handles only looked up by creator (no flips) */ - md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); - if (md == NULL || md->threshold == 0) { - CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " - LPX64"."LPX64"\n", ni->ni_pid.nid, 
hdr->src_nid, - (md == NULL) ? "invalid" : "inactive", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie); - - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - - CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", - ni->ni_pid.nid, hdr->src_nid, - hdr->msg.ack.dst_wmd.wh_object_cookie); - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_ACK; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.mlength = hdr->msg.ack.mlength; - msg->ev.match_bits = hdr->msg.ack.match_bits; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - ni->ni_counters.recv_count++; - - LIB_UNLOCK(nal, flags); - - /* We have received and matched up the ack OK, create the - * completion event now... */ - lib_finalize(nal, private, msg, PTL_OK); - - /* ...and now discard any junk after the hdr */ - (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); - - return (PTL_OK); -} - -static char * -hdr_type_string (ptl_hdr_t *hdr) -{ - switch (hdr->type) { - case PTL_MSG_ACK: - return ("ACK"); - case PTL_MSG_PUT: - return ("PUT"); - case PTL_MSG_GET: - return ("GET"); - case PTL_MSG_REPLY: - return ("REPLY"); - case PTL_MSG_HELLO: - return ("HELLO"); - default: - return (""); - } -} - -void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) -{ - char *type_str = hdr_type_string (hdr); - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid); - CWARN(" To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid); - - switch (hdr->type) { - default: - break; - - case PTL_MSG_PUT: - CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data "LPX64"\n", - hdr->payload_length, hdr->msg.put.offset, - 
hdr->msg.put.hdr_data); - break; - - case PTL_MSG_GET: - CWARN(" Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case PTL_MSG_ACK: - CWARN(" dst md "LPX64"."LPX64", " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case PTL_MSG_REPLY: - CWARN(" dst md "LPX64"."LPX64", " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } - -} /* end of print_hdr() */ - - -ptl_err_t -lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private) -{ - unsigned long flags; - ptl_err_t rc; - lib_msg_t *msg; - - /* NB we return PTL_OK if we manage to parse the header and believe - * it looks OK. 
Anything that goes wrong with receiving the - * message after that point is the responsibility of the NAL */ - - /* convert common fields to host byte order */ - hdr->type = le32_to_cpu(hdr->type); - hdr->src_nid = le64_to_cpu(hdr->src_nid); - hdr->src_pid = le32_to_cpu(hdr->src_pid); - hdr->dest_pid = le32_to_cpu(hdr->dest_pid); - hdr->payload_length = le32_to_cpu(hdr->payload_length); - - switch (hdr->type) { - case PTL_MSG_HELLO: { - /* dest_nid is really ptl_magicversion_t */ - ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; - - mv->magic = le32_to_cpu(mv->magic); - mv->version_major = le16_to_cpu(mv->version_major); - mv->version_minor = le16_to_cpu(mv->version_minor); - - if (mv->magic == PORTALS_PROTO_MAGIC && - mv->version_major == PORTALS_PROTO_VERSION_MAJOR && - mv->version_minor == PORTALS_PROTO_VERSION_MINOR) { - CWARN (LPU64": Dropping unexpected HELLO message: " - "magic %d, version %d.%d from "LPD64"\n", - nal->libnal_ni.ni_pid.nid, mv->magic, - mv->version_major, mv->version_minor, - hdr->src_nid); - - /* it's good but we don't want it */ - lib_drop_message(nal, private, hdr); - return PTL_OK; - } - - /* we got garbage */ - CERROR (LPU64": Bad HELLO message: " - "magic %d, version %d.%d from "LPD64"\n", - nal->libnal_ni.ni_pid.nid, mv->magic, - mv->version_major, mv->version_minor, - hdr->src_nid); - return PTL_FAIL; - } - - case PTL_MSG_ACK: - case PTL_MSG_PUT: - case PTL_MSG_GET: - case PTL_MSG_REPLY: - hdr->dest_nid = le64_to_cpu(hdr->dest_nid); - if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) { - CERROR(LPU64": BAD dest NID in %s message from" - LPU64" to "LPU64" (not me)\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid, hdr->dest_nid); - return PTL_FAIL; - } - break; - - default: - CERROR(LPU64": Bad message type 0x%x from "LPU64"\n", - nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid); - return PTL_FAIL; - } - - /* We've decided we're not receiving garbage since we can parse the - * header. 
We will return PTL_OK come what may... */ - - if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */ - fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ - { - CERROR(LPU64": Dropping incoming %s from "LPU64 - ": simulated failure\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid); - lib_drop_message(nal, private, hdr); - return PTL_OK; - } - - msg = lib_msg_alloc(nal); - if (msg == NULL) { - CERROR(LPU64": Dropping incoming %s from "LPU64 - ": can't allocate a lib_msg_t\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid); - lib_drop_message(nal, private, hdr); - return PTL_OK; - } - - switch (hdr->type) { - case PTL_MSG_ACK: - rc = parse_ack(nal, hdr, private, msg); - break; - case PTL_MSG_PUT: - rc = parse_put(nal, hdr, private, msg); - break; - case PTL_MSG_GET: - rc = parse_get(nal, hdr, private, msg); - break; - case PTL_MSG_REPLY: - rc = parse_reply(nal, hdr, private, msg); - break; - default: - LASSERT(0); - rc = PTL_FAIL; /* no compiler warning please */ - break; - } - - if (rc != PTL_OK) { - if (msg->md != NULL) { - /* committed... */ - lib_finalize(nal, private, msg, rc); - } else { - LIB_LOCK(nal, flags); - lib_msg_free(nal, msg); /* expects LIB_LOCK held */ - LIB_UNLOCK(nal, flags); - - lib_drop_message(nal, private, hdr); - } - } - - return PTL_OK; - /* That's "OK I can parse it", not "OK I like it" :) */ -} - -int -lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_ack_req_t ack, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, - ptl_size_t offset, ptl_hdr_data_t hdr_data) -{ - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg; - ptl_hdr_t hdr; - lib_md_t *md; - unsigned long flags; - int rc; - - if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ - fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ - { - CERROR("Dropping PUT to "LPU64": simulated failure\n", - id->nid); - return PTL_PROCESS_INVALID; - } - - msg = lib_msg_alloc(nal); - if (msg == NULL) { - CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", - ni->ni_pid.nid, id->nid); - return PTL_NO_SPACE; - } - - LIB_LOCK(nal, flags); - - md = ptl_handle2md(mdh, nal); - if (md == NULL || md->threshold == 0) { - lib_msg_free(nal, msg); - LIB_UNLOCK(nal, flags); - - return PTL_MD_INVALID; - } - - CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid); - - memset (&hdr, 0, sizeof (hdr)); - hdr.type = cpu_to_le32(PTL_MSG_PUT); - hdr.dest_nid = cpu_to_le64(id->nid); - hdr.dest_pid = cpu_to_le32(id->pid); - hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); - hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); - hdr.payload_length = cpu_to_le32(md->length); - - /* NB handles only looked up by creator (no flips) */ - if (ack == PTL_ACK_REQ) { - hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; - hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; - } else { - hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; - } - - hdr.msg.put.match_bits = cpu_to_le64(match_bits); - hdr.msg.put.ptl_index = cpu_to_le32(portal); - hdr.msg.put.offset = cpu_to_le32(offset); - hdr.msg.put.hdr_data = hdr_data; - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->ni_pid.nid; - msg->ev.initiator.pid = ni->ni_pid.pid; - msg->ev.pt_index = portal; - msg->ev.match_bits = match_bits; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = offset; - msg->ev.hdr_data = hdr_data; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - ni->ni_counters.send_count++; - ni->ni_counters.send_length += md->length; - - LIB_UNLOCK(nal, flags); - - rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT, - id->nid, id->pid, md, 0, md->length); - if (rc != PTL_OK) { - CERROR("Error sending PUT to "LPX64": %d\n", - id->nid, rc); - 
lib_finalize (nal, NULL, msg, rc); - } - - /* completion will be signalled by an event */ - return PTL_OK; -} - -lib_msg_t * -lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) -{ - /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg for the NAL to pass to lib_finalize() when the sink - * data has been received. - * - * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when - * lib_finalize() is called on it, so the NAL must call this first */ - - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg = lib_msg_alloc(nal); - lib_md_t *getmd = getmsg->md; - unsigned long flags; - - LIB_LOCK(nal, flags); - - LASSERT (getmd->pending > 0); - - if (msg == NULL) { - CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n", - peer_nid); - goto drop; - } - - if (getmd->threshold == 0) { - CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n", - peer_nid, getmd); - goto drop_msg; - } - - LASSERT (getmd->offset == 0); - - CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd); - - lib_commit_md (nal, getmd, msg); - - msg->ev.type = PTL_EVENT_REPLY_END; - msg->ev.initiator.nid = peer_nid; - msg->ev.initiator.pid = 0; /* XXX FIXME!!! 
*/ - msg->ev.rlength = msg->ev.mlength = getmd->length; - msg->ev.offset = 0; - - lib_md_deconstruct(nal, getmd, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, getmd); - - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += getmd->length; - - LIB_UNLOCK(nal, flags); - - return msg; - - drop_msg: - lib_msg_free(nal, msg); - drop: - nal->libnal_ni.ni_counters.drop_count++; - nal->libnal_ni.ni_counters.drop_length += getmd->length; - - LIB_UNLOCK (nal, flags); - - return NULL; -} - -int -lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, ptl_size_t offset) -{ - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg; - ptl_hdr_t hdr; - lib_md_t *md; - unsigned long flags; - int rc; - - if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ - fail_peer (nal, id->nid, 1)) /* shall we now? */ - { - CERROR("Dropping PUT to "LPX64": simulated failure\n", - id->nid); - return PTL_PROCESS_INVALID; - } - - msg = lib_msg_alloc(nal); - if (msg == NULL) { - CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", - id->nid); - return PTL_NO_SPACE; - } - - LIB_LOCK(nal, flags); - - md = ptl_handle2md(mdh, nal); - if (md == NULL || !md->threshold) { - lib_msg_free(nal, msg); - LIB_UNLOCK(nal, flags); - - return PTL_MD_INVALID; - } - - CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, - (unsigned long)id->pid); - - memset (&hdr, 0, sizeof (hdr)); - hdr.type = cpu_to_le32(PTL_MSG_GET); - hdr.dest_nid = cpu_to_le64(id->nid); - hdr.dest_pid = cpu_to_le32(id->pid); - hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); - hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); - hdr.payload_length = 0; - - /* NB handles only looked up by creator (no flips) */ - hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; - hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; - - hdr.msg.get.match_bits = 
cpu_to_le64(match_bits); - hdr.msg.get.ptl_index = cpu_to_le32(portal); - hdr.msg.get.src_offset = cpu_to_le32(offset); - hdr.msg.get.sink_length = cpu_to_le32(md->length); - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_SEND_END; - msg->ev.initiator = ni->ni_pid; - msg->ev.pt_index = portal; - msg->ev.match_bits = match_bits; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = offset; - msg->ev.hdr_data = 0; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - ni->ni_counters.send_count++; - - LIB_UNLOCK(nal, flags); - - rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET, - id->nid, id->pid, NULL, 0, 0); - if (rc != PTL_OK) { - CERROR(LPU64": error sending GET to "LPU64": %d\n", - ni->ni_pid.nid, id->nid, rc); - lib_finalize (nal, NULL, msg, rc); - } - - /* completion will be signalled by an event */ - return PTL_OK; -} - -void lib_assert_wire_constants (void) -{ - /* Wire protocol assertions generated by 'wirecheck' - * running on Linux mdevi 2.4.21-p4smp-55chaos #1 SMP Tue Jun 8 14:38:44 PDT 2004 i686 i686 i - * with gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34) */ - - - /* Constants... 
*/ - LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded); - LASSERT (PORTALS_PROTO_VERSION_MAJOR == 1); - LASSERT (PORTALS_PROTO_VERSION_MINOR == 0); - LASSERT (PTL_MSG_ACK == 0); - LASSERT (PTL_MSG_PUT == 1); - LASSERT (PTL_MSG_GET == 2); - LASSERT (PTL_MSG_REPLY == 3); - LASSERT (PTL_MSG_HELLO == 4); - - /* Checks for struct ptl_handle_wire_t */ - LASSERT ((int)sizeof(ptl_handle_wire_t) == 16); - LASSERT ((int)offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0); - LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8); - LASSERT ((int)offsetof(ptl_handle_wire_t, wh_object_cookie) == 8); - LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_object_cookie) == 8); - - /* Checks for struct ptl_magicversion_t */ - LASSERT ((int)sizeof(ptl_magicversion_t) == 8); - LASSERT ((int)offsetof(ptl_magicversion_t, magic) == 0); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->magic) == 4); - LASSERT ((int)offsetof(ptl_magicversion_t, version_major) == 4); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_major) == 2); - LASSERT ((int)offsetof(ptl_magicversion_t, version_minor) == 6); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_minor) == 2); - - /* Checks for struct ptl_hdr_t */ - LASSERT ((int)sizeof(ptl_hdr_t) == 72); - LASSERT ((int)offsetof(ptl_hdr_t, dest_nid) == 0); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_nid) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, src_nid) == 8); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_nid) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, dest_pid) == 16); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_pid) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, src_pid) == 20); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_pid) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, type) == 24); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, payload_length) == 28); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->payload_length) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg) == 32); - LASSERT 
((int)sizeof(((ptl_hdr_t *)0)->msg) == 40); - - /* Ack */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.mlength) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.mlength) == 4); - - /* Put */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.hdr_data) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.hdr_data) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ptl_index) == 64); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ptl_index) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.offset) == 68); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.offset) == 4); - - /* Get */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.return_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.return_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.ptl_index) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.ptl_index) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.src_offset) == 60); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.src_offset) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.sink_length) == 64); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.sink_length) == 4); - - /* Reply */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16); - - /* Hello */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.incarnation) == 
32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.incarnation) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.type) == 40); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.type) == 4); -} diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c deleted file mode 100644 index 54e89bc..0000000 --- a/lustre/portals/portals/lib-msg.c +++ /dev/null @@ -1,147 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-msg.c - * Message decoding, parsing and finalizing routines - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef __KERNEL__ -# include -#else -# define DEBUG_SUBSYSTEM S_PORTALS -# include -#endif - -#include - -void -lib_enq_event_locked (lib_nal_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev) -{ - ptl_event_t *eq_slot; - - /* Allocate the next queue slot */ - ev->link = ev->sequence = eq->eq_enq_seq++; - /* NB we don't support START events yet and we don't create a separate - * UNLINK event unless an explicit unlink succeeds, so the link - * sequence is pretty useless */ - - /* We don't support different uid/jids yet */ - ev->uid = 0; - ev->jid = 0; - - /* size must be a power of 2 to handle sequence # overflow */ - LASSERT (eq->eq_size != 0 && - eq->eq_size == LOWEST_BIT_SET (eq->eq_size)); - eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1)); - - /* There is no race since both event consumers and event producers - * take the LIB_LOCK(), so we don't screw around with memory - * barriers, setting the sequence number last or wierd structure - * layout assertions. 
*/ - *eq_slot = *ev; - - /* Call the callback handler (if any) */ - if (eq->eq_callback != NULL) - eq->eq_callback (eq_slot); - - /* Wake anyone sleeping for an event (see lib-eq.c) */ -#ifdef __KERNEL__ - if (waitqueue_active(&nal->libnal_ni.ni_waitq)) - wake_up_all(&nal->libnal_ni.ni_waitq); -#else - pthread_cond_broadcast(&nal->libnal_ni.ni_cond); -#endif -} - -void -lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) -{ - lib_md_t *md; - int unlink; - unsigned long flags; - int rc; - ptl_hdr_t ack; - - if (msg == NULL) - return; - - /* Only send an ACK if the PUT completed successfully */ - if (status == PTL_OK && - !ptl_is_wire_handle_none(&msg->ack_wmd)) { - - LASSERT(msg->ev.type == PTL_EVENT_PUT_END); - - memset (&ack, 0, sizeof (ack)); - ack.type = cpu_to_le32(PTL_MSG_ACK); - ack.dest_nid = cpu_to_le64(msg->ev.initiator.nid); - ack.dest_pid = cpu_to_le32(msg->ev.initiator.pid); - ack.src_nid = cpu_to_le64(nal->libnal_ni.ni_pid.nid); - ack.src_pid = cpu_to_le32(nal->libnal_ni.ni_pid.pid); - ack.payload_length = 0; - - ack.msg.ack.dst_wmd = msg->ack_wmd; - ack.msg.ack.match_bits = msg->ev.match_bits; - ack.msg.ack.mlength = cpu_to_le32(msg->ev.mlength); - - rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, - msg->ev.initiator.nid, msg->ev.initiator.pid, - NULL, 0, 0); - if (rc != PTL_OK) { - /* send failed: there's nothing else to clean up. */ - CERROR("Error %d sending ACK to "LPX64"\n", - rc, msg->ev.initiator.nid); - } - } - - md = msg->md; - - LIB_LOCK(nal, flags); - - /* Now it's safe to drop my caller's ref */ - md->pending--; - LASSERT (md->pending >= 0); - - /* Should I unlink this MD? 
*/ - if (md->pending != 0) /* other refs */ - unlink = 0; - else if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) != 0) - unlink = 1; - else if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) == 0) - unlink = 0; - else - unlink = lib_md_exhausted(md); - - msg->ev.ni_fail_type = status; - msg->ev.unlinked = unlink; - - if (md->eq != NULL) - lib_enq_event_locked(nal, private, md->eq, &msg->ev); - - if (unlink) - lib_md_unlink(nal, md); - - list_del (&msg->msg_list); - nal->libnal_ni.ni_counters.msgs_alloc--; - lib_msg_free(nal, msg); - - LIB_UNLOCK(nal, flags); -} diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c deleted file mode 100644 index 0f298a0..0000000 --- a/lustre/portals/portals/lib-ni.c +++ /dev/null @@ -1,72 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-ni.c - * Network status registers and distance functions. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * Copyright (c) 2001-2002 Sandia National Laboratories - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -#define MAX_DIST 18446744073709551615ULL - -int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx, - ptl_sr_value_t *status) -{ - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_counters_t *count = &ni->ni_counters; - - switch (sr_idx) { - case PTL_SR_DROP_COUNT: - *status = count->drop_count; - return PTL_OK; - case PTL_SR_DROP_LENGTH: - *status = count->drop_length; - return PTL_OK; - case PTL_SR_RECV_COUNT: - *status = count->recv_count; - return PTL_OK; - case PTL_SR_RECV_LENGTH: - *status = count->recv_length; - return PTL_OK; - case PTL_SR_SEND_COUNT: - *status = count->send_count; - return PTL_OK; - case PTL_SR_SEND_LENGTH: - *status = count->send_length; - return PTL_OK; - case PTL_SR_MSGS_MAX: - *status = count->msgs_max; - return PTL_OK; - default: - *status = 0; - return PTL_SR_INDEX_INVALID; - } -} - - -int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist) -{ - lib_nal_t *nal = apinal->nal_data; - - return (nal->libnal_dist(nal, pid->nid, dist)); -} diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c deleted file mode 100644 index ff2a601..0000000 --- a/lustre/portals/portals/lib-pid.c +++ /dev/null @@ -1,46 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-pid.c - * - * Process identification routines - * Copyright (C) 2001-2003 Cluster File Systems, Inc. - * Copyright (C) 2001-2003 Cluster File Systems, Inc. - * - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* This should be removed. The NAL should have the PID information */ -#define DEBUG_SUBSYSTEM S_PORTALS - -#if defined (__KERNEL__) -# include -extern int getpid(void); -#else -# include -# include -#endif -#include - -int -lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid) -{ - lib_nal_t *nal = apinal->nal_data; - - *pid = nal->libnal_ni.ni_pid; - return PTL_OK; -} diff --git a/lustre/portals/portals/module.c b/lustre/portals/portals/module.c deleted file mode 100644 index c1303b7..0000000 --- a/lustre/portals/portals/module.c +++ /dev/null @@ -1,201 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#define DEBUG_SUBSYSTEM S_PORTALS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -extern void (kping_client)(struct portal_ioctl_data *); - -static int kportal_ioctl(struct portal_ioctl_data *data, - unsigned int cmd, unsigned long arg) -{ - int err; - char str[PTL_NALFMT_SIZE]; - ENTRY; - - switch (cmd) { - case IOC_PORTAL_PING: { - void (*ping)(struct portal_ioctl_data *); - - CDEBUG(D_IOCTL, "doing %d pings to nid "LPX64" (%s)\n", - data->ioc_count, data->ioc_nid, - portals_nid2str(data->ioc_nal, data->ioc_nid, str)); - ping = PORTAL_SYMBOL_GET(kping_client); - if (!ping) - CERROR("PORTAL_SYMBOL_GET failed\n"); - else { - ping(data); - PORTAL_SYMBOL_PUT(kping_client); - } - RETURN(0); - } - - case IOC_PORTAL_GET_NID: { - ptl_handle_ni_t nih; - ptl_process_id_t pid; - - CDEBUG (D_IOCTL, "Getting nid for nal [%x]\n", data->ioc_nal); - - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, - NULL, &nih); - if (!(err == PTL_OK || err == PTL_IFACE_DUP)) - RETURN (-EINVAL); - - err = PtlGetId (nih, &pid); - LASSERT (err == PTL_OK); - - PtlNIFini(nih); - - data->ioc_nid = pid.nid; - if (copy_to_user ((char *)arg, data, sizeof (*data))) - RETURN (-EFAULT); - RETURN(0); - } - - case IOC_PORTAL_FAIL_NID: { - ptl_handle_ni_t nih; - - CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", - data->ioc_nal, data->ioc_nid, data->ioc_count); - - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, - NULL, &nih); - if (!(err == PTL_OK || err == PTL_IFACE_DUP)) - return (-EINVAL); - - if (err == PTL_OK) { - /* There's no point in failing an interface that - * came into existance just for this */ - err = -EINVAL; - } else { - err = PtlFailNid (nih, data->ioc_nid, data->ioc_count); - if (err != PTL_OK) - err = -EINVAL; - } 
- - PtlNIFini(nih); - RETURN (err); - } - default: - RETURN(-EINVAL); - } - /* Not Reached */ -} - -DECLARE_IOCTL_HANDLER(kportal_ioctl_handler, kportal_ioctl); - -static int init_kportals_module(void) -{ - int rc; - ENTRY; - - rc = PtlInit(NULL); - if (rc) { - CERROR("PtlInit: error %d\n", rc); - RETURN(rc); - } - - rc = libcfs_register_ioctl(&kportal_ioctl_handler); - LASSERT (rc == 0); - - RETURN(rc); -} - -static void exit_kportals_module(void) -{ - int rc; - - rc = libcfs_deregister_ioctl(&kportal_ioctl_handler); - LASSERT (rc == 0); - - PtlFini(); -} - -EXPORT_SYMBOL(ptl_register_nal); -EXPORT_SYMBOL(ptl_unregister_nal); - -EXPORT_SYMBOL(ptl_err_str); -EXPORT_SYMBOL(PtlMEAttach); -EXPORT_SYMBOL(PtlMEInsert); -EXPORT_SYMBOL(PtlMEUnlink); -EXPORT_SYMBOL(PtlEQAlloc); -EXPORT_SYMBOL(PtlMDAttach); -EXPORT_SYMBOL(PtlMDUnlink); -EXPORT_SYMBOL(PtlNIInit); -EXPORT_SYMBOL(PtlNIFini); -EXPORT_SYMBOL(PtlInit); -EXPORT_SYMBOL(PtlFini); -EXPORT_SYMBOL(PtlSnprintHandle); -EXPORT_SYMBOL(PtlPut); -EXPORT_SYMBOL(PtlGet); -EXPORT_SYMBOL(PtlEQWait); -EXPORT_SYMBOL(PtlEQFree); -EXPORT_SYMBOL(PtlEQGet); -EXPORT_SYMBOL(PtlGetId); -EXPORT_SYMBOL(PtlMDBind); -EXPORT_SYMBOL(lib_iov_nob); -EXPORT_SYMBOL(lib_copy_iov2buf); -EXPORT_SYMBOL(lib_copy_buf2iov); -EXPORT_SYMBOL(lib_extract_iov); -EXPORT_SYMBOL(lib_kiov_nob); -EXPORT_SYMBOL(lib_copy_kiov2buf); -EXPORT_SYMBOL(lib_copy_buf2kiov); -EXPORT_SYMBOL(lib_extract_kiov); -EXPORT_SYMBOL(lib_finalize); -EXPORT_SYMBOL(lib_parse); -EXPORT_SYMBOL(lib_create_reply_msg); -EXPORT_SYMBOL(lib_init); -EXPORT_SYMBOL(lib_fini); - -MODULE_AUTHOR("Peter J. 
Braam "); -MODULE_DESCRIPTION("Portals v3.1"); -MODULE_LICENSE("GPL"); -module_init(init_kportals_module); -module_exit(exit_kportals_module); diff --git a/lustre/portals/router/.cvsignore b/lustre/portals/router/.cvsignore deleted file mode 100644 index 5ed596b..0000000 --- a/lustre/portals/router/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -.deps -Makefile -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/router/Makefile.in b/lustre/portals/router/Makefile.in deleted file mode 100644 index 3bb6cf7..0000000 --- a/lustre/portals/router/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := kptlrouter -kptlrouter-objs := router.o proc.o - -@INCLUDE_RULES@ diff --git a/lustre/portals/router/Makefile.mk b/lustre/portals/router/Makefile.mk deleted file mode 100644 index 9b02c03..0000000 --- a/lustre/portals/router/Makefile.mk +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -include $(src)/../Kernelenv - -obj-y += kptlrouter.o -kptlrouter-objs := router.o proc.o diff --git a/lustre/portals/router/autoMakefile.am b/lustre/portals/router/autoMakefile.am deleted file mode 100644 index fa11e8c..0000000 --- a/lustre/portals/router/autoMakefile.am +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -modulenet_DATA = kptlrouter$(KMODEXT) -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(kptlrouter-objs:%.o=%.c) router.h diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c deleted file mode 100644 index 61b6880..0000000 --- a/lustre/portals/router/proc.c +++ /dev/null @@ -1,242 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#include "router.h" - -#define KPR_PROC_ROUTER "sys/portals/router" -#define KPR_PROC_ROUTES "sys/portals/routes" - -/* Used for multi-page route list book keeping */ -struct proc_route_data { - struct list_head *curr; - unsigned int generation; - off_t skip; - rwlock_t proc_route_rwlock; -} kpr_read_routes_data; - -/* nal2name support re-used from utils/portals.c */ -struct name2num { - char *name; - int num; -} nalnames[] = { - { "any", 0}, - { "elan", QSWNAL}, - { "tcp", SOCKNAL}, - { "gm", GMNAL}, - { "ib", OPENIBNAL}, - { "iib", IIBNAL}, - { "lo", LONAL}, - { NULL, -1} -}; - -static struct name2num *name2num_lookup_num(struct name2num *table, int num) -{ - while (table->name != NULL) - if (num == table->num) - return (table); - else - table++; - return (NULL); -} - -static char *nal2name(int nal) -{ - struct name2num *e = name2num_lookup_num(nalnames, nal); - return ((e == NULL) ? "???" : e->name); -} - - -static int kpr_proc_router_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - unsigned long long bytes = kpr_fwd_bytes; - unsigned long packets = kpr_fwd_packets; - unsigned long errors = kpr_fwd_errors; - unsigned int qdepth = atomic_read (&kpr_queue_depth); - int len; - - *eof = 1; - if (off != 0) - return (0); - - len = sprintf(page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); - - *start = page; - return (len); -} - -static int kpr_proc_router_write(struct file *file, const char *ubuffer, - unsigned long count, void *data) -{ - /* Ignore what we've been asked to write, and just zero the stats */ - kpr_fwd_bytes = 0; - kpr_fwd_packets = 0; - kpr_fwd_errors = 0; - - return (count); -} - -static int kpr_proc_routes_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct proc_route_data *prd = data; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; - int chunk_len = 0; - int line_len = 0; - int user_len = 0; - int rc = 0; - - *eof = 1; - *start = page; - - 
write_lock(&(prd->proc_route_rwlock)); - - if (prd->curr == NULL) { - if (off != 0) - goto routes_read_exit; - - /* First pass, initialize our private data */ - prd->curr = kpr_routes.next; - prd->generation = kpr_routes_generation; - prd->skip = 0; - } else { - /* Abort route list generation change */ - if (prd->generation != kpr_routes_generation) { - prd->curr = NULL; - rc = sprintf(page, "\nError: Routes Changed\n"); - goto routes_read_exit; - } - - /* All the routes have been walked */ - if (prd->curr == &kpr_routes) { - prd->curr = NULL; - goto routes_read_exit; - } - } - - read_lock(&kpr_rwlock); - *start = page + prd->skip; - user_len = -prd->skip; - - while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) { - re = list_entry(prd->curr, kpr_route_entry_t, kpre_list); - ge = re->kpre_gateway; - - line_len = sprintf(page + chunk_len, - "%12s "LPX64" : "LPX64" - "LPX64", %s\n", - nal2name(ge->kpge_nalid), ge->kpge_nid, - re->kpre_lo_nid, re->kpre_hi_nid, - ge->kpge_alive ? "up" : "down"); - chunk_len += line_len; - user_len += line_len; - - /* Abort the route list changed */ - if (prd->curr->next == NULL) { - prd->curr = NULL; - read_unlock(&kpr_rwlock); - rc = sprintf(page, "\nError: Routes Changed\n"); - goto routes_read_exit; - } - - prd->curr = prd->curr->next; - - /* The route table will exceed one page, break the while loop - * so the function can be re-called with a new page. - */ - if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) - break; - } - - *eof = 0; - - /* Caller received only a portion of the last entry, the - * remaining will be delivered in the next page if asked for. 
- */ - if (user_len > count) { - prd->curr = prd->curr->prev; - prd->skip = line_len - (user_len - count); - read_unlock(&kpr_rwlock); - rc = count; - goto routes_read_exit; - } - - /* Not enough data to entirely satify callers request */ - prd->skip = 0; - read_unlock(&kpr_rwlock); - rc = user_len; - -routes_read_exit: - write_unlock(&(prd->proc_route_rwlock)); - return rc; -} - -static int kpr_proc_routes_write(struct file *file, const char *ubuffer, - unsigned long count, void *data) -{ - /* no-op; lctl should be used to adjust the routes */ - return (count); -} - -void kpr_proc_init(void) -{ - struct proc_dir_entry *router_entry; - struct proc_dir_entry *routes_entry; - - /* Initialize KPR_PROC_ROUTER */ - router_entry = create_proc_entry (KPR_PROC_ROUTER, - S_IFREG | S_IRUGO | S_IWUSR, NULL); - - if (router_entry == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); - return; - } - - router_entry->data = NULL; - router_entry->read_proc = kpr_proc_router_read; - router_entry->write_proc = kpr_proc_router_write; - - /* Initialize KPR_PROC_ROUTES */ - routes_entry = create_proc_entry (KPR_PROC_ROUTES, - S_IFREG | S_IRUGO | S_IWUSR, NULL); - - if (routes_entry == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTES); - return; - } - - kpr_read_routes_data.curr = NULL; - kpr_read_routes_data.generation = 0; - kpr_read_routes_data.skip = 0; - kpr_read_routes_data.proc_route_rwlock = RW_LOCK_UNLOCKED; - - routes_entry->data = &kpr_read_routes_data; - routes_entry->read_proc = kpr_proc_routes_read; - routes_entry->write_proc = kpr_proc_routes_write; -} - -void kpr_proc_fini(void) -{ - remove_proc_entry(KPR_PROC_ROUTER, 0); - remove_proc_entry(KPR_PROC_ROUTES, 0); -} diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c deleted file mode 100644 index 7edc5f6..0000000 --- a/lustre/portals/router/router.c +++ /dev/null @@ -1,824 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * 
vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "router.h" - -LIST_HEAD(kpr_routes); -LIST_HEAD(kpr_gateways); -LIST_HEAD(kpr_nals); - -unsigned int kpr_routes_generation; -unsigned long long kpr_fwd_bytes; -unsigned long kpr_fwd_packets; -unsigned long kpr_fwd_errors; -atomic_t kpr_queue_depth; - -/* Mostly the tables are read-only (thread and interrupt context) - * - * Once in a blue moon we register/deregister NALs and add/remove routing - * entries (thread context only)... 
*/ -rwlock_t kpr_rwlock = RW_LOCK_UNLOCKED; - -kpr_router_interface_t kpr_router_interface = { - kprri_register: kpr_register_nal, - kprri_lookup: kpr_lookup_target, - kprri_fwd_start: kpr_forward_packet, - kprri_fwd_done: kpr_complete_packet, - kprri_notify: kpr_nal_notify, - kprri_shutdown: kpr_shutdown_nal, - kprri_deregister: kpr_deregister_nal, -}; - -int -kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) -{ - unsigned long flags; - struct list_head *e; - kpr_nal_entry_t *ne; - - CDEBUG (D_NET, "Registering NAL %x\n", nalif->kprni_nalid); - - PORTAL_ALLOC (ne, sizeof (*ne)); - if (ne == NULL) - return (-ENOMEM); - - memset (ne, 0, sizeof (*ne)); - memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); - - LASSERT (!in_interrupt()); - write_lock_irqsave (&kpr_rwlock, flags); - - for (e = kpr_nals.next; e != &kpr_nals; e = e->next) - { - kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); - - if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) - { - write_unlock_irqrestore (&kpr_rwlock, flags); - - CERROR ("Attempt to register same NAL %x twice\n", ne->kpne_interface.kprni_nalid); - - PORTAL_FREE (ne, sizeof (*ne)); - return (-EEXIST); - } - } - - list_add (&ne->kpne_list, &kpr_nals); - - write_unlock_irqrestore (&kpr_rwlock, flags); - - *argp = ne; - PORTAL_MODULE_USE; - return (0); -} - -void -kpr_do_upcall (void *arg) -{ - kpr_upcall_t *u = (kpr_upcall_t *)arg; - char nalstr[10]; - char nidstr[36]; - char whenstr[36]; - char *argv[] = { - NULL, - "ROUTER_NOTIFY", - nalstr, - nidstr, - u->kpru_alive ? 
"up" : "down", - whenstr, - NULL}; - - snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id); - snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid); - snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); - - portals_run_upcall (argv); - - kfree (u); -} - -void -kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when) -{ - char str[PTL_NALFMT_SIZE]; - - /* May be in arbitrary context */ - kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC); - - if (u == NULL) { - CERROR ("Upcall out of memory: nal %x nid "LPX64" (%s) %s\n", - gw_nalid, gw_nid, - portals_nid2str(gw_nalid, gw_nid, str), - alive ? "up" : "down"); - return; - } - - u->kpru_nal_id = gw_nalid; - u->kpru_nid = gw_nid; - u->kpru_alive = alive; - u->kpru_when = when; - - prepare_work (&u->kpru_tq, kpr_do_upcall, u); - schedule_work (&u->kpru_tq); -} - -int -kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, - int alive, time_t when) -{ - unsigned long flags; - int found; - kpr_nal_entry_t *ne = NULL; - kpr_gateway_entry_t *ge = NULL; - struct timeval now; - struct list_head *e; - struct list_head *n; - char str[PTL_NALFMT_SIZE]; - - CDEBUG (D_NET, "%s notifying [%x] "LPX64": %s\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, alive ? "up" : "down"); - - /* can't do predictions... */ - do_gettimeofday (&now); - if (when > now.tv_sec) { - CWARN ("Ignoring prediction from %s of [%x] "LPX64" %s " - "%ld seconds in the future\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, - alive ? "up" : "down", - when - now.tv_sec); - return (EINVAL); - } - - LASSERT (when <= now.tv_sec); - - /* Serialise with lookups (i.e. 
write lock) */ - write_lock_irqsave(&kpr_rwlock, flags); - - found = 0; - list_for_each_safe (e, n, &kpr_gateways) { - - ge = list_entry(e, kpr_gateway_entry_t, kpge_list); - if ((gateway_nalid != 0 && - ge->kpge_nalid != gateway_nalid) || - ge->kpge_nid != gateway_nid) - continue; - - found = 1; - break; - } - - if (!found) { - /* gateway not found */ - write_unlock_irqrestore(&kpr_rwlock, flags); - CDEBUG (D_NET, "Gateway not found\n"); - return (0); - } - - if (when < ge->kpge_timestamp) { - /* out of date information */ - write_unlock_irqrestore (&kpr_rwlock, flags); - CDEBUG (D_NET, "Out of date\n"); - return (0); - } - - /* update timestamp */ - ge->kpge_timestamp = when; - - if ((!ge->kpge_alive) == (!alive)) { - /* new date for old news */ - write_unlock_irqrestore (&kpr_rwlock, flags); - CDEBUG (D_NET, "Old news\n"); - return (0); - } - - ge->kpge_alive = alive; - CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive); - - if (alive) { - /* Reset all gateway weights so the newly-enabled gateway - * doesn't have to play catch-up */ - list_for_each_safe (e, n, &kpr_gateways) { - kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge->kpge_weight, 0); - } - } - - found = 0; - if (!byNal) { - /* userland notified me: notify NAL? */ - ne = kpr_find_nal_entry_locked (ge->kpge_nalid); - if (ne != NULL) { - if (!ne->kpne_shutdown && - ne->kpne_interface.kprni_notify != NULL) { - /* take a ref on this NAL until notifying - * it has completed... */ - atomic_inc (&ne->kpne_refcount); - found = 1; - } - } - } - - write_unlock_irqrestore(&kpr_rwlock, flags); - - if (found) { - ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg, - gateway_nid, alive); - /* 'ne' can disappear now... */ - atomic_dec (&ne->kpne_refcount); - } - - if (byNal) { - /* It wasn't userland that notified me... 
*/ - CWARN ("Upcall: NAL %x NID "LPX64" (%s) is %s\n", - gateway_nalid, gateway_nid, - portals_nid2str(gateway_nalid, gateway_nid, str), - alive ? "alive" : "dead"); - kpr_upcall (gateway_nalid, gateway_nid, alive, when); - } else { - CDEBUG (D_NET, " NOT Doing upcall\n"); - } - - return (0); -} - -void -kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when) -{ - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when); -} - -void -kpr_shutdown_nal (void *arg) -{ - unsigned long flags; - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - CDEBUG (D_NET, "Shutting down NAL %x\n", ne->kpne_interface.kprni_nalid); - - LASSERT (!ne->kpne_shutdown); - LASSERT (!in_interrupt()); - - write_lock_irqsave (&kpr_rwlock, flags); - ne->kpne_shutdown = 1; - write_unlock_irqrestore (&kpr_rwlock, flags); -} - -void -kpr_deregister_nal (void *arg) -{ - unsigned long flags; - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - CDEBUG (D_NET, "Deregister NAL %x\n", ne->kpne_interface.kprni_nalid); - - LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ - LASSERT (!in_interrupt()); - - write_lock_irqsave (&kpr_rwlock, flags); - list_del (&ne->kpne_list); - write_unlock_irqrestore (&kpr_rwlock, flags); - - /* Wait until all outstanding messages/notifications have completed */ - while (atomic_read (&ne->kpne_refcount) != 0) - { - CDEBUG (D_NET, "Waiting for refcount on NAL %x to reach zero (%d)\n", - ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); - - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - PORTAL_FREE (ne, sizeof (*ne)); - PORTAL_MODULE_UNUSE; -} - -int -kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) -{ - const int significant_bits = 0x00ffffff; - /* We use atomic_t to record/compare route weights for - * load-balancing. 
Here we limit ourselves to only using - * 'significant_bits' when we do an 'after' comparison */ - - int diff = (atomic_read (&ge1->kpge_weight) - - atomic_read (&ge2->kpge_weight)) & significant_bits; - int rc = (diff > (significant_bits >> 1)); - - CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n", - ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight), - rc ? ">" : "<", - ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight)); - - return (rc); -} - -void -kpr_update_weight (kpr_gateway_entry_t *ge, int nob) -{ - int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t); - - /* We've chosen this route entry (i.e. gateway) to forward payload - * of length 'nob'; update the route's weight to make it less - * favoured. Note that the weight is 1 plus the payload size - * rounded and scaled to the portals header size, so we get better - * use of the significant bits in kpge_weight. */ - - CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge, - ge->kpge_nid, weight); - - atomic_add (weight, &ge->kpge_weight); -} - -int -kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, - ptl_nid_t *gateway_nidp) -{ - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - struct list_head *e; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge = NULL; - int rc = -ENOENT; - - /* Caller wants to know if 'target_nid' can be reached via a gateway - * ON HER OWN NETWORK */ - - CDEBUG (D_NET, "lookup "LPX64" from NAL %x\n", target_nid, - ne->kpne_interface.kprni_nalid); - LASSERT (!in_interrupt()); - - read_lock (&kpr_rwlock); - - if (ne->kpne_shutdown) { /* caller is shutting down */ - read_unlock (&kpr_rwlock); - return (-ENOENT); - } - - /* Search routes for one that has a gateway to target_nid on the callers network */ - - list_for_each (e, &kpr_routes) { - re = list_entry (e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > target_nid || - re->kpre_hi_nid < target_nid) - continue; - - /* found table entry */ - - if (re->kpre_gateway->kpge_nalid != 
ne->kpne_interface.kprni_nalid || - !re->kpre_gateway->kpge_alive) { - /* different NAL or gateway down */ - rc = -EHOSTUNREACH; - continue; - } - - if (ge == NULL || - kpr_ge_isbetter (re->kpre_gateway, ge)) - ge = re->kpre_gateway; - } - - if (ge != NULL) { - kpr_update_weight (ge, nob); - *gateway_nidp = ge->kpge_nid; - rc = 0; - } - - read_unlock (&kpr_rwlock); - - /* NB can't deref 're' now; it might have been removed! */ - - CDEBUG (D_NET, "lookup "LPX64" from NAL %x: %d ("LPX64")\n", - target_nid, ne->kpne_interface.kprni_nalid, rc, - (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); - return (rc); -} - -kpr_nal_entry_t * -kpr_find_nal_entry_locked (int nal_id) -{ - struct list_head *e; - - /* Called with kpr_rwlock held */ - - list_for_each (e, &kpr_nals) { - kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list); - - if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */ - continue; - - return (ne); - } - - return (NULL); -} - -void -kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; - ptl_nid_t target_nid = fwd->kprfd_target_nid; - int nob = fwd->kprfd_nob; - kpr_gateway_entry_t *ge = NULL; - kpr_nal_entry_t *dst_ne = NULL; - struct list_head *e; - kpr_route_entry_t *re; - kpr_nal_entry_t *tmp_ne; - int rc; - - CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid); - - LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); - LASSERT (!in_interrupt()); - - read_lock (&kpr_rwlock); - - kpr_fwd_packets++; /* (loose) stats accounting */ - kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); - - if (src_ne->kpne_shutdown) { /* caller is shutting down */ - rc = -ESHUTDOWN; - goto out; - } - - fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ - - /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ - - list_for_each (e, &kpr_routes) { - re = list_entry (e, kpr_route_entry_t, kpre_list); - - if 
(re->kpre_lo_nid > target_nid || /* no match */ - re->kpre_hi_nid < target_nid) - continue; - - if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid) - continue; /* don't route to same NAL */ - - if (!re->kpre_gateway->kpge_alive) - continue; /* gateway is dead */ - - tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid); - - if (tmp_ne == NULL || - tmp_ne->kpne_shutdown) { - /* NAL must be registered and not shutting down */ - continue; - } - - if (ge == NULL || - kpr_ge_isbetter (re->kpre_gateway, ge)) { - ge = re->kpre_gateway; - dst_ne = tmp_ne; - } - } - - if (ge != NULL) { - LASSERT (dst_ne != NULL); - - kpr_update_weight (ge, nob); - - fwd->kprfd_gateway_nid = ge->kpge_nid; - atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */ - atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */ - atomic_inc (&kpr_queue_depth); - - read_unlock (&kpr_rwlock); - - CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x: " - "to "LPX64" on NAL %x\n", - fwd, target_nid, src_ne->kpne_interface.kprni_nalid, - fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); - - dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); - return; - } - - rc = -EHOSTUNREACH; - out: - kpr_fwd_errors++; - - CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %x: %d\n", - fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc); - - (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc); - - read_unlock (&kpr_rwlock); -} - -void -kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) -{ - kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; - - CDEBUG (D_NET, "complete(1) [%p] from NAL %x to NAL %x: %d\n", fwd, - src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); - - atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! 
*/ - - (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); - - CDEBUG (D_NET, "complete(2) [%p] from NAL %x: %d\n", fwd, - src_ne->kpne_interface.kprni_nalid, error); - - atomic_dec (&kpr_queue_depth); - atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */ -} - -int -kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid) -{ - unsigned long flags; - struct list_head *e; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; - int dup = 0; - - CDEBUG(D_NET, "Add route: %x "LPX64" : "LPX64" - "LPX64"\n", - gateway_nalid, gateway_nid, lo_nid, hi_nid); - - if (gateway_nalid == PTL_NID_ANY || - lo_nid == PTL_NID_ANY || - hi_nid == PTL_NID_ANY || - lo_nid > hi_nid) - return (-EINVAL); - - PORTAL_ALLOC (ge, sizeof (*ge)); - if (ge == NULL) - return (-ENOMEM); - - ge->kpge_nalid = gateway_nalid; - ge->kpge_nid = gateway_nid; - ge->kpge_alive = 1; - ge->kpge_timestamp = 0; - ge->kpge_refcount = 0; - atomic_set (&ge->kpge_weight, 0); - - PORTAL_ALLOC (re, sizeof (*re)); - if (re == NULL) { - PORTAL_FREE (ge, sizeof (*ge)); - return (-ENOMEM); - } - - re->kpre_lo_nid = lo_nid; - re->kpre_hi_nid = hi_nid; - - LASSERT(!in_interrupt()); - write_lock_irqsave (&kpr_rwlock, flags); - - list_for_each (e, &kpr_gateways) { - kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, - kpge_list); - - if (ge2->kpge_nalid == gateway_nalid && - ge2->kpge_nid == gateway_nid) { - PORTAL_FREE (ge, sizeof (*ge)); - ge = ge2; - dup = 1; - break; - } - } - - if (!dup) { - /* Adding a new gateway... 
*/ - list_add (&ge->kpge_list, &kpr_gateways); - - /* ...zero all gateway weights so this one doesn't have to - * play catch-up */ - - list_for_each (e, &kpr_gateways) { - kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge2->kpge_weight, 0); - } - } - - re->kpre_gateway = ge; - ge->kpge_refcount++; - list_add (&re->kpre_list, &kpr_routes); - kpr_routes_generation++; - - write_unlock_irqrestore (&kpr_rwlock, flags); - return (0); -} - -int -kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid, - int alive, time_t when) -{ - return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); -} - -int -kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, - ptl_nid_t lo, ptl_nid_t hi) -{ - int specific = (lo != PTL_NID_ANY); - unsigned long flags; - int rc = -ENOENT; - struct list_head *e; - struct list_head *n; - - CDEBUG(D_NET, "Del route [%x] "LPX64" : "LPX64" - "LPX64"\n", - gw_nalid, gw_nid, lo, hi); - - LASSERT(!in_interrupt()); - - /* NB Caller may specify either all routes via the given gateway - * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are - * actual NIDs) */ - if (specific ? 
(hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY)) - return (-EINVAL); - - write_lock_irqsave(&kpr_rwlock, flags); - - list_for_each_safe (e, n, &kpr_routes) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - kpr_gateway_entry_t *ge = re->kpre_gateway; - - if (ge->kpge_nalid != gw_nalid || - ge->kpge_nid != gw_nid || - (specific && - (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid))) - continue; - - rc = 0; - - if (--ge->kpge_refcount == 0) { - list_del (&ge->kpge_list); - PORTAL_FREE (ge, sizeof (*ge)); - } - - list_del (&re->kpre_list); - PORTAL_FREE(re, sizeof (*re)); - - if (specific) - break; - } - - kpr_routes_generation++; - write_unlock_irqrestore(&kpr_rwlock, flags); - - return (rc); -} - -int -kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive) -{ - struct list_head *e; - - LASSERT (!in_interrupt()); - read_lock(&kpr_rwlock); - - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - kpr_gateway_entry_t *ge = re->kpre_gateway; - - if (idx-- == 0) { - *gateway_nalid = ge->kpge_nalid; - *gateway_nid = ge->kpge_nid; - *alive = ge->kpge_alive; - *lo_nid = re->kpre_lo_nid; - *hi_nid = re->kpre_hi_nid; - - read_unlock(&kpr_rwlock); - return (0); - } - } - - read_unlock (&kpr_rwlock); - return (-ENOENT); -} - -static int -kpr_nal_cmd(struct portals_cfg *pcfg, void * private) -{ - int err = -EINVAL; - ENTRY; - - switch(pcfg->pcfg_command) { - default: - CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command); - break; - - case NAL_CMD_ADD_ROUTE: - CDEBUG(D_IOCTL, "Adding route: [%x] "LPU64" : "LPU64" - "LPU64"\n", - pcfg->pcfg_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; - - case NAL_CMD_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing routes via [%x] "LPU64" : "LPU64" - 
"LPU64"\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; - - case NAL_CMD_NOTIFY_ROUTER: { - CDEBUG (D_IOCTL, "Notifying peer [%x] "LPU64" %s @ %ld\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags ? "Enabling" : "Disabling", - (time_t)pcfg->pcfg_nid3); - - err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3); - break; - } - - case NAL_CMD_GET_ROUTE: - CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count); - err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal, - &pcfg->pcfg_nid, - &pcfg->pcfg_nid2, &pcfg->pcfg_nid3, - &pcfg->pcfg_flags); - break; - } - RETURN(err); -} - - -static void /*__exit*/ -kpr_finalise (void) -{ - LASSERT (list_empty (&kpr_nals)); - - libcfs_nal_cmd_unregister(ROUTER); - - PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); - - kpr_proc_fini(); - - while (!list_empty (&kpr_routes)) { - kpr_route_entry_t *re = list_entry(kpr_routes.next, - kpr_route_entry_t, - kpre_list); - - list_del(&re->kpre_list); - PORTAL_FREE(re, sizeof (*re)); - } - - CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", - atomic_read(&portal_kmemory)); -} - -static int __init -kpr_initialise (void) -{ - int rc; - - CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", - atomic_read(&portal_kmemory)); - - kpr_routes_generation = 0; - kpr_proc_init(); - - rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL); - if (rc != 0) { - CERROR("Can't register nal cmd handler\n"); - return (rc); - } - - PORTAL_SYMBOL_REGISTER(kpr_router_interface); - return (0); -} - -MODULE_AUTHOR("Eric Barton"); -MODULE_DESCRIPTION("Kernel Portals Router v0.01"); -MODULE_LICENSE("GPL"); - -module_init (kpr_initialise); -module_exit (kpr_finalise); - -EXPORT_SYMBOL (kpr_router_interface); diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h deleted file mode 100644 index 
27e4983..0000000 --- a/lustre/portals/router/router.h +++ /dev/null @@ -1,105 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#ifndef _KPTLROUTER_H -#define _KPTLROUTER_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_PTLROUTER - -#include -#include -#include -#include - -typedef struct -{ - struct list_head kpne_list; - kpr_nal_interface_t kpne_interface; - atomic_t kpne_refcount; - int kpne_shutdown; -} kpr_nal_entry_t; - -typedef struct -{ - struct list_head kpge_list; - atomic_t kpge_weight; - time_t kpge_timestamp; - int kpge_alive; - int kpge_nalid; - int kpge_refcount; - ptl_nid_t kpge_nid; -} kpr_gateway_entry_t; - -typedef struct -{ - struct list_head kpre_list; - kpr_gateway_entry_t *kpre_gateway; - ptl_nid_t kpre_lo_nid; - ptl_nid_t kpre_hi_nid; -} kpr_route_entry_t; - -typedef struct -{ - work_struct_t kpru_tq; - int kpru_nal_id; - ptl_nid_t kpru_nid; - int kpru_alive; - time_t kpru_when; -} kpr_upcall_t; - -extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); -extern int 
kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, - ptl_nid_t *gateway_nidp); -extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id); -extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); -extern void kpr_nal_notify (void *arg, ptl_nid_t peer, - int alive, time_t when); -extern void kpr_shutdown_nal (void *arg); -extern void kpr_deregister_nal (void *arg); - -extern void kpr_proc_init (void); -extern void kpr_proc_fini (void); - -extern unsigned int kpr_routes_generation; -extern unsigned long long kpr_fwd_bytes; -extern unsigned long kpr_fwd_packets; -extern unsigned long kpr_fwd_errors; -extern atomic_t kpr_queue_depth; - -extern struct list_head kpr_routes; -extern rwlock_t kpr_rwlock; - -#endif /* _KPLROUTER_H */ diff --git a/lustre/portals/tests/.cvsignore b/lustre/portals/tests/.cvsignore deleted file mode 100644 index e034130..0000000 --- a/lustre/portals/tests/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -Makefile -.deps -.*.cmd -autoMakefile.in -autoMakefile -*.ko -*.mod.c -.*.flags -.tmp_versions -.depend diff --git a/lustre/portals/tests/Makefile.in b/lustre/portals/tests/Makefile.in deleted file mode 100644 index c309db0..0000000 --- a/lustre/portals/tests/Makefile.in +++ /dev/null @@ -1,16 +0,0 @@ -MODULES := pingsrv pingcli spingsrv spingcli -pingsrv-objs := ping_srv.o - -ifeq ($(PATCHLEVEL),6) -pingcli-objs := ping_cli.o -spingsrv-objs := sping_srv.o -spingcli-objs := sping_cli.o -else -ping%.c: ping_%.c - ln -sf $< $@ - -sping%.c: sping_%.c - ln -sf $< $@ -endif - -@INCLUDE_RULES@ diff --git a/lustre/portals/tests/Makefile.mk b/lustre/portals/tests/Makefile.mk deleted file mode 100644 index 751c0a0..0000000 --- a/lustre/portals/tests/Makefile.mk +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -include $(src)/../Kernelenv - -obj-y += ping_cli.o -obj-y += ping_srv.o diff --git a/lustre/portals/tests/autoMakefile.am b/lustre/portals/tests/autoMakefile.am deleted file mode 100644 index 5f81b93..0000000 --- a/lustre/portals/tests/autoMakefile.am +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -if MODULES -if !CRAY_PORTALS -if TESTS -noinst_DATA := pingsrv$(KMODEXT) pingcli$(KMODEXT) -noinst_DATA += spingsrv$(KMODEXT) spingcli$(KMODEXT) -endif -endif -endif - -MOSTLYCLEANFILES = *.o *.ko *.mod.c pingsrv.c pingcli.c spingsrv.c spingcli.c -DIST_SOURCES = ping_srv.c ping_cli.c sping_srv.c sping_cli.c ping.h diff --git a/lustre/portals/tests/ping.h b/lustre/portals/tests/ping.h deleted file mode 100644 index f07444b..0000000 --- a/lustre/portals/tests/ping.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef _KPING_INCLUDED -#define _KPING_INCLUDED - -#include - - -#define PTL_PING_IN_SIZE 256 // n packets per buffer -#define PTL_PING_IN_BUFFERS 2 // n fallback buffers - -#define PTL_PING_CLIENT 4 -#define PTL_PING_SERVER 5 - -#define PING_HEADER_MAGIC 0xDEADBEEF -#define PING_BULK_MAGIC 0xCAFEBABE - -#define PING_HEAD_BITS 0x00000001 -#define PING_BULK_BITS 0x00000002 -#define PING_IGNORE_BITS 0xFFFFFFFC - -#define PTL_PING_ACK 0x01 -#define PTL_PING_VERBOSE 0x02 -#define PTL_PING_VERIFY 0x04 -#define PTL_PING_PREALLOC 0x08 - - -#define NEXT_PRIMARY_BUFFER(index) \ - (((index + 1) >= PTL_PING_IN_BUFFERS) ? 
0 : (index + 1)) - -#define PDEBUG(str, err) \ - CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) - - -/* Ping data to be passed via the ioctl to kernel space */ - -#if __KERNEL__ - - -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -#include -#else -#include -#endif -struct pingsrv_data { - - ptl_handle_ni_t ni; - ptl_handle_me_t me; - ptl_handle_eq_t eq; - void *in_buf; - ptl_process_id_t my_id; - ptl_process_id_t id_local; - ptl_md_t mdin; - ptl_md_t mdout; - ptl_handle_md_t mdin_h; - ptl_handle_md_t mdout_h; - ptl_event_t evnt; - struct task_struct *tsk; -}; /* struct pingsrv_data */ - -struct pingcli_data { - - struct portal_ioctl_data *args; - ptl_handle_me_t me; - ptl_handle_eq_t eq; - char *inbuf; - char *outbuf; - ptl_process_id_t myid; - ptl_process_id_t id_local; - ptl_process_id_t id_remote; - ptl_md_t md_in_head; - ptl_md_t md_out_head; - ptl_handle_md_t md_in_head_h; - ptl_handle_md_t md_out_head_h; - ptl_event_t ev; - struct task_struct *tsk; -}; /* struct pingcli_data */ - - -#endif /* __KERNEL__ */ - -#endif /* _KPING_INCLUDED */ diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c deleted file mode 100644 index e9a8481..0000000 --- a/lustre/portals/tests/ping_cli.c +++ /dev/null @@ -1,303 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * Author: Brian Behlendorf - * Kedar Sovani (kedar@calsoftinc.com) - * Amey Inamdar (amey@calsoftinc.com) - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include -#include -#include -#include -#include "ping.h" -/* int portal_debug = D_PING_CLI; */ - - -#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) - -#define MAX_TIME 100000 - -/* This should be enclosed in a structure */ - -static struct pingcli_data *client = NULL; - -static int count = 0; - -static void -pingcli_shutdown(ptl_handle_ni_t nih, int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. 
- */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (client->md_out_head_h))) - PDEBUG ("PtlMDUnlink", rc); - case 2: - if ((rc = PtlMDUnlink (client->md_in_head_h))) - PDEBUG ("PtlMDUnlink", rc); - - /* Free the event queue */ - if ((rc = PtlEQFree (client->eq))) - PDEBUG ("PtlEQFree", rc); - - if ((rc = PtlMEUnlink (client->me))) - PDEBUG ("PtlMEUnlink", rc); - case 3: - PtlNIFini(nih); - - case 4: - /* Free our buffers */ - - if (client != NULL) - PORTAL_FREE (client, - sizeof(struct pingcli_data)); - } - - - CDEBUG (D_OTHER, "ping client released resources\n"); -} /* pingcli_shutdown() */ - -static void pingcli_callback(ptl_event_t *ev) -{ - int i, magic; - i = *(int *)(ev->md.start + ev->offset + sizeof(unsigned)); - magic = *(int *)(ev->md.start + ev->offset); - - if(magic != 0xcafebabe) { - CERROR("Unexpected response %x\n", magic); - } - - if((i == count) || !count) - wake_up_process (client->tsk); - else - CERROR("Received response after timeout for %d\n",i); -} - - -static struct pingcli_data * -pingcli_start(struct portal_ioctl_data *args) -{ - ptl_handle_ni_t nih = PTL_INVALID_HANDLE; - unsigned ping_head_magic = PING_HEADER_MAGIC; - unsigned ping_bulk_magic = PING_BULK_MAGIC; - int rc; - struct timeval tv1, tv2; - char str[PTL_NALFMT_SIZE]; - - client->tsk = current; - client->args = args; - CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s), \ - nal %x, size %u, count: %u, timeout: %u\n", - args->ioc_nid, - portals_nid2str(args->ioc_nal, args->ioc_nid, str), - args->ioc_nal, args->ioc_size, - args->ioc_count, args->ioc_timeout); - - - PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ; - if (client->outbuf == NULL) - { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - PORTAL_ALLOC (client->inbuf, - (args->ioc_size + STDSIZE) * args->ioc_count); - if (client->inbuf == NULL) - { - CERROR ("Unable to allocate out_buf 
("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Aquire and initialize the proper nal for portals. */ - rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); - if (rc != PTL_OK || rc != PTL_IFACE_DUP) - { - CERROR ("NAL %x not loaded\n", args->ioc_nal); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Based on the initialization aquire our unique portal ID. */ - if ((rc = PtlGetId (nih, &client->myid))) - { - CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Setup the local match entries */ - client->id_local.nid = PTL_NID_ANY; - client->id_local.pid = PTL_PID_ANY; - - /* Setup the remote match entries */ - client->id_remote.nid = args->ioc_nid; - client->id_remote.pid = 0; - - if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, - client->id_local, 0, ~0, PTL_RETAIN, - PTL_INS_AFTER, &client->me))) - { - CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) - { - CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - count = args->ioc_count; - - client->md_in_head.start = client->inbuf; - client->md_in_head.length = (args->ioc_size + STDSIZE) - * count; - client->md_in_head.threshold = PTL_MD_THRESH_INF; - client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_in_head.user_ptr = NULL; - client->md_in_head.eq_handle = client->eq; - memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); - - /* Attach the incoming buffer */ - if ((rc = PtlMDAttach (client->me, client->md_in_head, - PTL_UNLINK, &client->md_in_head_h))) { - CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); - } - /* Setup the outgoing ping header */ - client->md_out_head.start = client->outbuf; - client->md_out_head.length = STDSIZE + args->ioc_size; - 
client->md_out_head.threshold = args->ioc_count; - client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_out_head.user_ptr = NULL; - client->md_out_head.eq_handle = PTL_EQ_NONE; - - memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic)); - - count = 0; - - /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (nih, client->md_out_head, - PTL_UNLINK, &client->md_out_head_h))) { - CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (nih, 1); - return NULL; - } - while ((args->ioc_count - count)) { - memcpy (client->outbuf + sizeof(unsigned), - &(count), sizeof(unsigned)); - /* Put the ping packet */ - do_gettimeofday (&tv1); - - memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1, - sizeof(struct timeval)); - - if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, - client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { - PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (nih, 1); - return NULL; - } - CWARN ("Lustre: sent msg no %d", count); - - set_current_state (TASK_INTERRUPTIBLE); - rc = schedule_timeout (20 * args->ioc_timeout); - if (rc == 0) { - CERROR ("timeout .....\n"); - } else { - do_gettimeofday (&tv2); - CWARN("Reply in %u usec\n", - (unsigned)((tv2.tv_sec - tv1.tv_sec) - * 1000000 + (tv2.tv_usec - tv1.tv_usec))); - } - count++; - } - - if (client->outbuf != NULL) - PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size); - - if (client->inbuf != NULL) - PORTAL_FREE (client->inbuf, - (args->ioc_size + STDSIZE) * args->ioc_count); - - pingcli_shutdown (nih, 2); - - /* Success! 
*/ - return NULL; -} /* pingcli_setup() */ - - - -/* called by the portals_ioctl for ping requests */ -int kping_client(struct portal_ioctl_data *args) -{ - PORTAL_ALLOC (client, sizeof(struct pingcli_data)); - if (client == NULL) - { - CERROR ("Unable to allocate client structure\n"); - return (0); - } - memset (client, 0, sizeof(struct pingcli_data)); - pingcli_start (args); - - return 0; -} /* kping_client() */ - - -static int __init pingcli_init(void) -{ - PORTAL_SYMBOL_REGISTER(kping_client); - return 0; -} /* pingcli_init() */ - - -static void /*__exit*/ pingcli_cleanup(void) -{ - PORTAL_SYMBOL_UNREGISTER (kping_client); -} /* pingcli_cleanup() */ - - -MODULE_AUTHOR("Brian Behlendorf (LLNL)"); -MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); -MODULE_LICENSE("GPL"); - -module_init(pingcli_init); -module_exit(pingcli_cleanup); - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -EXPORT_SYMBOL (kping_client); -#endif diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c deleted file mode 100644 index 49e82af..0000000 --- a/lustre/portals/tests/ping_srv.c +++ /dev/null @@ -1,308 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * Author: Brian Behlendorf - * Amey Inamdar - * Kedar Sovani - * - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include "ping.h" - -#include -#include -#include -#include -#include -#include -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#include -#else -#include -#endif -#include -#include - -#include -#include - -#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) -#define MAXSIZE (16*1024) - -static unsigned ping_head_magic; -static unsigned ping_bulk_magic; -static int nal = SOCKNAL; // Your NAL, -static unsigned long packets_valid = 0; // Valid packets -static int running = 1; -atomic_t pkt; - -static struct pingsrv_data *server=NULL; // Our ping server - -static void *pingsrv_shutdown(int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. 
- */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (server->mdin_h))) - PDEBUG ("PtlMDUnlink (out head buffer)", rc); - case 2: - /* Free the event queue */ - if ((rc = PtlEQFree (server->eq))) - PDEBUG ("PtlEQFree", rc); - - /* Unlink the client portal from the ME list */ - if ((rc = PtlMEUnlink (server->me))) - PDEBUG ("PtlMEUnlink", rc); - - case 3: - PtlNIFini (server->ni); - - case 4: - - case 5: - if (server->in_buf != NULL) - PORTAL_FREE (server->in_buf, MAXSIZE); - - if (server != NULL) - PORTAL_FREE (server, - sizeof (struct pingsrv_data)); - - } - - CDEBUG (D_OTHER, "ping sever resources released\n"); - return NULL; -} /* pingsrv_shutdown() */ - - -int pingsrv_thread(void *arg) -{ - int rc; - unsigned long magic; - unsigned long ping_bulk_magic = 0xcafebabe; - - kportal_daemonize ("pingsrv"); - server->tsk = current; - - while (running) { - set_current_state (TASK_INTERRUPTIBLE); - if (atomic_read (&pkt) == 0) { - schedule_timeout (MAX_SCHEDULE_TIMEOUT); - continue; - } - - magic = *((int *)(server->evnt.md.start - + server->evnt.offset)); - - - if(magic != 0xdeadbeef) { - CERROR("Unexpected Packet to the server\n"); - - } - memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); - - server->mdout.length = server->evnt.rlength; - server->mdout.start = server->in_buf; - server->mdout.threshold = 1; - server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdout.user_ptr = NULL; - server->mdout.eq_handle = PTL_EQ_NONE; - - /* Bind the outgoing buffer */ - if ((rc = PtlMDBind (server->ni, server->mdout, - PTL_UNLINK, &server->mdout_h))) { - PDEBUG ("PtlMDBind", rc); - pingsrv_shutdown (1); - return 1; - } - - - server->mdin.start = server->in_buf; - server->mdin.length = MAXSIZE; - server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdin.user_ptr = NULL; - server->mdin.eq_handle = server->eq; - - if ((rc = 
PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); - CDEBUG (D_OTHER, "ping server resources allocated\n"); - } - - if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, - server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) - PDEBUG ("PtlPut", rc); - - atomic_dec (&pkt); - - } - pingsrv_shutdown (1); - running = 1; - return 0; -} - -static void pingsrv_packet(ptl_event_t *ev) -{ - atomic_inc (&pkt); - wake_up_process (server->tsk); -} /* pingsrv_head() */ - -static void pingsrv_callback(ptl_event_t *ev) -{ - - if (ev == NULL) { - CERROR ("null in callback, ev=%p\n", ev); - return; - } - server->evnt = *ev; - - CWARN ("received ping from nid "LPX64" " - "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", - ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - *((int *)(ev->md.start + ev->offset)), - *((int *)(ev->md.start + ev->offset + sizeof(unsigned))), - *((int *)(ev->md.start + ev->offset + 2 * - sizeof(unsigned)))); - - packets_valid++; - - pingsrv_packet(ev); - -} /* pingsrv_callback() */ - - -static struct pingsrv_data *pingsrv_setup(void) -{ - int rc; - - server->ni = PTL_INVALID_HANDLE; - - /* Aquire and initialize the proper nal for portals. */ - rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); - if (!(rc == PTL_OK || rc == PTL_IFACE_DUP)) { - CDEBUG (D_OTHER, "NAL %x not loaded\n", nal); - return pingsrv_shutdown (4); - } - - - /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (server->ni, &server->my_id))) { - PDEBUG ("PtlGetId", rc); - return pingsrv_shutdown (2); - } - - server->id_local.nid = PTL_NID_ANY; - server->id_local.pid = PTL_PID_ANY; - - /* Attach a match entries for header packets */ - if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, - server->id_local,0, ~0, - PTL_RETAIN, PTL_INS_AFTER, &server->me))) { - PDEBUG ("PtlMEAttach", rc); - return pingsrv_shutdown (2); - } - - - if ((rc = PtlEQAlloc (server->ni, 1024, &pingsrv_callback, - &server->eq))) { - PDEBUG ("PtlEQAlloc (callback)", rc); - return pingsrv_shutdown (2); - } - - PORTAL_ALLOC (server->in_buf, MAXSIZE); - if(!server->in_buf){ - CDEBUG (D_OTHER,"Allocation error\n"); - return pingsrv_shutdown(2); - } - - /* Setup the incoming buffer */ - server->mdin.start = server->in_buf; - server->mdin.length = MAXSIZE; - server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdin.user_ptr = NULL; - server->mdin.eq_handle = server->eq; - memset (server->in_buf, 0, STDSIZE); - - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); - CDEBUG (D_OTHER, "ping server resources allocated\n"); - } - - /* Success! 
*/ - return server; -} /* pingsrv_setup() */ - -static int pingsrv_start(void) -{ - /* Setup our server */ - if (!pingsrv_setup()) { - CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); - return -ENOMEM; - } - kernel_thread (pingsrv_thread,NULL,0); - return 0; -} /* pingsrv_start() */ - - - -static int __init pingsrv_init(void) -{ - ping_head_magic = PING_HEADER_MAGIC; - ping_bulk_magic = PING_BULK_MAGIC; - PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); - return pingsrv_start (); -} /* pingsrv_init() */ - - -static void /*__exit*/ pingsrv_cleanup(void) -{ - remove_proc_entry ("net/pingsrv", NULL); - - running = 0; - wake_up_process (server->tsk); - while (running != 1) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - -} /* pingsrv_cleanup() */ - - -MODULE_PARM(nal, "i"); -MODULE_PARM_DESC(nal, "Use the specified NAL " - "(2-ksocknal, 1-kqswnal)"); - -MODULE_AUTHOR("Brian Behlendorf (LLNL)"); -MODULE_DESCRIPTION("A kernel space ping server for portals testing"); -MODULE_LICENSE("GPL"); - -module_init(pingsrv_init); -module_exit(pingsrv_cleanup); diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c deleted file mode 100644 index d9970e7..0000000 --- a/lustre/portals/tests/sping_cli.c +++ /dev/null @@ -1,279 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * Author: Brian Behlendorf - * Kedar Sovani (kedar@calsoftinc.com) - * Amey Inamdar (amey@calsoftinc.com) - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -/* This is a striped down version of pinger. It follows a single - * request-response protocol. Doesn't do Bulk data pinging. Also doesn't - * send multiple packets in a single ioctl. - */ - - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include -#include -#include -#include -#include "ping.h" -/* int portal_debug = D_PING_CLI; */ - - -#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes - assumed */ - -/* This should be enclosed in a structure */ - -static struct pingcli_data *client = NULL; - -static int count = 0; - -static void -pingcli_shutdown(ptl_handle_ni_t nih, int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. 
- */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (client->md_out_head_h))) - PDEBUG ("PtlMDUnlink", rc); - case 2: - /* Free the event queue */ - if ((rc = PtlEQFree (client->eq))) - PDEBUG ("PtlEQFree", rc); - - if ((rc = PtlMEUnlink (client->me))) - PDEBUG ("PtlMEUnlink", rc); - case 3: - PtlNIFini (nih); - - case 4: - /* Free our buffers */ - if (client->outbuf != NULL) - PORTAL_FREE (client->outbuf, STDSIZE); - - if (client->inbuf != NULL) - PORTAL_FREE (client->inbuf, STDSIZE); - - - if (client != NULL) - PORTAL_FREE (client, - sizeof(struct pingcli_data)); - } - - - CDEBUG (D_OTHER, "ping client released resources\n"); -} /* pingcli_shutdown() */ - -static void pingcli_callback(ptl_event_t *ev) -{ - wake_up_process (client->tsk); -} - - -static struct pingcli_data * -pingcli_start(struct portal_ioctl_data *args) -{ - ptl_handle_ni_t nih = PTL_INVALID_HANDLE; - unsigned ping_head_magic = PING_HEADER_MAGIC; - char str[PTL_NALFMT_SIZE]; - int rc; - - client->tsk = current; - client->args = args; - - CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s), \ - nal %x, size %u, count: %u, timeout: %u\n", - args->ioc_nid, - portals_nid2str(args->ioc_nid, args->ioc_nal, str), - args->ioc_nal, args->ioc_size, - args->ioc_count, args->ioc_timeout); - - - PORTAL_ALLOC (client->outbuf, STDSIZE) ; - if (client->outbuf == NULL) - { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - PORTAL_ALLOC (client->inbuf, STDSIZE); - - if (client->inbuf == NULL) - { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Aquire and initialize the proper nal for portals. 
*/ - rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) - { - CERROR ("NAL %x not loaded.\n", args->ioc_nal); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Based on the initialization aquire our unique portal ID. */ - if ((rc = PtlGetId (nih, &client->myid))) - { - CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Setup the local match entries */ - client->id_local.nid = PTL_NID_ANY; - client->id_local.pid = PTL_PID_ANY; - - /* Setup the remote match entries */ - client->id_remote.nid = args->ioc_nid; - client->id_remote.pid = 0; - - if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, - client->id_local, 0, ~0, PTL_RETAIN, - PTL_INS_AFTER, &client->me))) - { - CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) - { - CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - - client->md_in_head.start = client->inbuf; - client->md_in_head.length = STDSIZE; - client->md_in_head.threshold = 1; - client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_in_head.user_ptr = NULL; - client->md_in_head.eq_handle = client->eq; - memset (client->inbuf, 0, STDSIZE); - - /* Attach the incoming buffer */ - if ((rc = PtlMDAttach (client->me, client->md_in_head, - PTL_UNLINK, &client->md_in_head_h))) { - CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); - } - - /* Setup the outgoing ping header */ - client->md_out_head.start = client->outbuf; - client->md_out_head.length = STDSIZE; - client->md_out_head.threshold = 1; - client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_out_head.user_ptr = NULL; - client->md_out_head.eq_handle = PTL_EQ_NONE; - - memcpy (client->outbuf, &ping_head_magic, 
sizeof(ping_head_magic)); - - /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (nih, client->md_out_head, - PTL_UNLINK, &client->md_out_head_h))) { - CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); - } - /* Put the ping packet */ - if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, - client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { - PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (nih, 1); - return NULL; - } - - count = 0; - set_current_state (TASK_INTERRUPTIBLE); - rc = schedule_timeout (20 * args->ioc_timeout); - if (rc == 0) { - CERROR ("Time out on the server\n"); - pingcli_shutdown (nih, 2); - return NULL; - } else { - CWARN("Received respose from the server \n"); - } - - pingcli_shutdown (nih, 2); - - /* Success! */ - return NULL; -} /* pingcli_setup() */ - - - -/* called by the portals_ioctl for ping requests */ -int kping_client(struct portal_ioctl_data *args) -{ - - PORTAL_ALLOC (client, sizeof(struct pingcli_data)); - memset (client, 0, sizeof(struct pingcli_data)); - if (client == NULL) - { - CERROR ("Unable to allocate client structure\n"); - return (0); - } - pingcli_start (args); - - return 0; -} /* kping_client() */ - - -static int __init pingcli_init(void) -{ - PORTAL_SYMBOL_REGISTER(kping_client); - return 0; -} /* pingcli_init() */ - - -static void /*__exit*/ pingcli_cleanup(void) -{ - PORTAL_SYMBOL_UNREGISTER (kping_client); -} /* pingcli_cleanup() */ - - -MODULE_AUTHOR("Brian Behlendorf (LLNL)"); -MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); -MODULE_LICENSE("GPL"); - -module_init(pingcli_init); -module_exit(pingcli_cleanup); - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -EXPORT_SYMBOL (kping_client); -#endif diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c deleted file mode 100644 index 069423d..0000000 --- a/lustre/portals/tests/sping_srv.c +++ /dev/null @@ -1,294 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; 
indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * Author: Brian Behlendorf - * Amey Inamdar - * Kedar Sovani - * - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* This is a striped down version of pinger. It follows a single - * request-response protocol. Doesn't do Bulk data pinging. Also doesn't - * send multiple packets in a single ioctl. - */ - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include "ping.h" - -#include -#include -#include -#include -#include -#include -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#include -#else -#include -#endif -#include -#include - -#include -#include - -#define STDSIZE (sizeof(int) + sizeof(int) + 4) - -static int nal = PTL_IFACE_DEFAULT; // Your NAL, -static unsigned long packets_valid = 0; // Valid packets -static int running = 1; -atomic_t pkt; - -static struct pingsrv_data *server=NULL; // Our ping server - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#endif - -static void *pingsrv_shutdown(int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. 
- */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (server->mdin_h))) - PDEBUG ("PtlMDUnlink (out head buffer)", rc); - case 2: - /* Free the event queue */ - if ((rc = PtlEQFree (server->eq))) - PDEBUG ("PtlEQFree", rc); - - /* Unlink the client portal from the ME list */ - if ((rc = PtlMEUnlink (server->me))) - PDEBUG ("PtlMEUnlink", rc); - - case 3: - PtlNIFini(server->ni); - - case 4: - - if (server->in_buf != NULL) - PORTAL_FREE (server->in_buf, STDSIZE); - - if (server != NULL) - PORTAL_FREE (server, - sizeof (struct pingsrv_data)); - - } - - CDEBUG (D_OTHER, "ping sever resources released\n"); - return NULL; -} /* pingsrv_shutdown() */ - - -int pingsrv_thread(void *arg) -{ - int rc; - - kportal_daemonize ("pingsrv"); - server->tsk = current; - - while (running) { - set_current_state (TASK_INTERRUPTIBLE); - if (atomic_read (&pkt) == 0) { - schedule_timeout (MAX_SCHEDULE_TIMEOUT); - continue; - } - - server->mdout.start = server->in_buf; - server->mdout.length = STDSIZE; - server->mdout.threshold = 1; - server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdout.user_ptr = NULL; - server->mdout.eq_handle = PTL_EQ_NONE; - - /* Bind the outgoing buffer */ - if ((rc = PtlMDBind (server->ni, server->mdout, - PTL_UNLINK, &server->mdout_h))) { - PDEBUG ("PtlMDBind", rc); - pingsrv_shutdown (1); - return 1; - } - - - server->mdin.start = server->in_buf; - server->mdin.length = STDSIZE; - server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdin.user_ptr = NULL; - server->mdin.eq_handle = server->eq; - - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); - CDEBUG (D_OTHER, "ping server resources allocated\n"); - } - - if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, - server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) - PDEBUG ("PtlPut", rc); - - atomic_dec 
(&pkt); - - } - pingsrv_shutdown (1); - running = 1; - return 0; -} - -static void pingsrv_packet(ptl_event_t *ev) -{ - atomic_inc (&pkt); - wake_up_process (server->tsk); -} /* pingsrv_head() */ - -static void pingsrv_callback(ptl_event_t *ev) -{ - - if (ev == NULL) { - CERROR ("null in callback, ev=%p\n", ev); - return; - } - server->evnt = *ev; - - CWARN("Lustre: received ping from nid "LPX64" " - "(off=%u rlen=%u mlen=%u head=%x)\n", - ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - *((int *)(ev->md.start + ev->offset))); - - packets_valid++; - - pingsrv_packet(ev); - -} /* pingsrv_callback() */ - - -static struct pingsrv_data *pingsrv_setup(void) -{ - int rc; - - /* Aquire and initialize the proper nal for portals. */ - server->ni = PTL_INVALID_HANDLE; - - rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - CDEBUG (D_OTHER, "Nal %x not loaded.\n", nal); - return pingsrv_shutdown (4); - } - - /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (server->ni, &server->my_id))) { - PDEBUG ("PtlGetId", rc); - return pingsrv_shutdown (2); - } - - server->id_local.nid = PTL_NID_ANY; - server->id_local.pid = PTL_PID_ANY; - - /* Attach a match entries for header packets */ - if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, - server->id_local,0, ~0, - PTL_RETAIN, PTL_INS_AFTER, &server->me))) { - PDEBUG ("PtlMEAttach", rc); - return pingsrv_shutdown (2); - } - - - if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback, - &server->eq))) { - PDEBUG ("PtlEQAlloc (callback)", rc); - return pingsrv_shutdown (2); - } - - PORTAL_ALLOC (server->in_buf, STDSIZE); - if(!server->in_buf){ - CDEBUG (D_OTHER,"Allocation error\n"); - return pingsrv_shutdown(2); - } - - /* Setup the incoming buffer */ - server->mdin.start = server->in_buf; - server->mdin.length = STDSIZE; - server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdin.user_ptr = NULL; - server->mdin.eq_handle = server->eq; - memset (server->in_buf, 0, STDSIZE); - - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); - CDEBUG (D_OTHER, "ping server resources allocated\n"); - } - - /* Success! 
*/ - return server; -} /* pingsrv_setup() */ - -static int pingsrv_start(void) -{ - /* Setup our server */ - if (!pingsrv_setup()) { - CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); - return -ENOMEM; - } - kernel_thread (pingsrv_thread,NULL,0); - return 0; -} /* pingsrv_start() */ - - - -static int __init pingsrv_init(void) -{ - PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); - return pingsrv_start (); -} /* pingsrv_init() */ - - -static void /*__exit*/ pingsrv_cleanup(void) -{ - remove_proc_entry ("net/pingsrv", NULL); - - running = 0; - wake_up_process (server->tsk); - while (running != 1) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - -} /* pingsrv_cleanup() */ - - -MODULE_PARM(nal, "i"); -MODULE_PARM_DESC(nal, "Use the specified NAL " - "(2-ksocknal, 1-kqswnal)"); - -MODULE_AUTHOR("Brian Behlendorf (LLNL)"); -MODULE_DESCRIPTION("A kernel space ping server for portals testing"); -MODULE_LICENSE("GPL"); - -module_init(pingsrv_init); -module_exit(pingsrv_cleanup); diff --git a/lustre/portals/tests/startclient.sh b/lustre/portals/tests/startclient.sh deleted file mode 100755 index be60509..0000000 --- a/lustre/portals/tests/startclient.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -SIMPLE=${SIMPLE:-0} - -if [ $SIMPLE -eq 0 ]; then - PING=pingcli.o -else - PING=spingcli.o -fi - -case "$1" in - tcp) - /sbin/insmod ../oslib/portals.o - /sbin/insmod ../socknal/ksocknal.o - /sbin/insmod ./$PING - echo ksocknal > /tmp/nal - ;; - - elan) - /sbin/insmod ../oslib/portals.o - /sbin/insmod ../qswnal/kqswnal.o - /sbin/insmod ./$PING - echo kqswnal > /tmp/nal - ;; - - gm) - /sbin/insmod portals - /sbin/insmod kgmnal - /sbin/insmod ./$PING - echo kgmnal > /tmp/nal - ;; - - *) - echo "Usage : ${0} < tcp | elan | gm>" - exit 1; -esac -exit 0; diff --git a/lustre/portals/tests/startserver.sh b/lustre/portals/tests/startserver.sh deleted file mode 100755 index 9b5ccf6..0000000 --- a/lustre/portals/tests/startserver.sh +++ /dev/null 
@@ -1,38 +0,0 @@ -#!/bin/sh - -SIMPLE=${SIMPLE:-0} - -if [ $SIMPLE -eq 0 ]; then - PING=pingsrv.o -else - PING=spingsrv.o -fi - -case "$1" in - tcp) - /sbin/insmod ../oslib/portals.o - /sbin/insmod ../socknal/ksocknal.o - /sbin/insmod ./$PING nal=2 - echo ksocknal > /tmp/nal - ;; - - elan) - /sbin/insmod ../oslib/portals.o - /sbin/insmod ../qswnal/kqswnal.o - /sbin/insmod ./$PING nal=4 - echo kqswnal > /tmp/nal - ;; - - gm) - /sbin/insmod portals - /sbin/insmod kgmnal - /sbin/insmod ./$PING nal=3 - echo kgmnal > /tmp/nal - ;; - - *) - echo "Usage : ${0} < tcp | elan | gm>" - exit 1; -esac -../utils/acceptor 9999& -exit 0; diff --git a/lustre/portals/tests/stopclient.sh b/lustre/portals/tests/stopclient.sh deleted file mode 100755 index f7e3aa1..0000000 --- a/lustre/portals/tests/stopclient.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -SIMPLE=${SIMPLE:-1} - -if [ $SIMPLE -eq 0 ]; then - PING=spingcli -else - PING=pingcli -fi - -rmmod $PING -NAL=`cat /tmp/nal`; -rmmod $NAL -rmmod portals diff --git a/lustre/portals/tests/stopserver.sh b/lustre/portals/tests/stopserver.sh deleted file mode 100644 index 3e81831..0000000 --- a/lustre/portals/tests/stopserver.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh - -SIMPLE=${SIMPLE:-1} - -if [ $SIMPLE -eq 0 ]; then - PING=spingsrv -else - PING=pingsrv -fi - -rmmod $PING -NAL=`cat /tmp/nal`; -rmmod $NAL -killall -9 acceptor -rm -f /var/run/acceptor-9999.pid -rmmod portals diff --git a/lustre/portals/unals/.cvsignore b/lustre/portals/unals/.cvsignore deleted file mode 100644 index e995588..0000000 --- a/lustre/portals/unals/.cvsignore +++ /dev/null @@ -1,3 +0,0 @@ -.deps -Makefile -Makefile.in diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am deleted file mode 100644 index 3437d39..0000000 --- a/lustre/portals/unals/Makefile.am +++ /dev/null @@ -1,10 +0,0 @@ -if LIBLUSTRE -if !CRAY_PORTALS -noinst_LIBRARIES = libtcpnal.a -endif -endif - -noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h 
connection.h ipmap.h bridge.h procbridge.h -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h -libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) -libtcpnal_a_CFLAGS = $(LLCFLAGS) diff --git a/lustre/portals/unals/README b/lustre/portals/unals/README deleted file mode 100644 index 6cb93d9..0000000 --- a/lustre/portals/unals/README +++ /dev/null @@ -1,53 +0,0 @@ -This library implements two NAL interfaces, both running over IP. -The first, tcpnal, creates TCP connections between participating -processes in order to transport the portals requests. The second, -ernal, provides a simple transport protocol which runs over -UDP datagrams. - -The interface functions return both of these values in host order for -convenience and readability. However this means that addresses -exchanged in messages between hosts of different orderings will not -function properly. - -Both NALs use the same support functions in order to schedule events -and communicate with the generic portals implementation. - - ------------------------- - | api | - |_______________________| - | lib | - |_______________________| - | ernal | |tcpnal | - |--------| |----------| - | udpsock| |connection| - |-----------------------| - | timer/select | - ------------------------- - - - These NALs uses the framework from fdnal of a pipe between the api -and library sides. This is wrapped up in the select on the library -side, and blocks on the api side. Performance could be severely -enhanced by collapsing this aritificial barrier, by using shared -memory queues, or by wiring the api layer directly to the library. - - -nid is defined as the low order 24-bits of the IP address of the -physical node left shifted by 8 plus a virtual node number of 0 -through 255 (really only 239). The virtual node number of a tcpnal -application should be specified using the environment variable -PTL_VIRTNODE. 
pid is now a completely arbitrary number in the -range of 0 to 255. The IP interface used can be overridden by -specifying the appropriate hostid by setting the PTL_HOSTID -environment variable. The value can be either dotted decimal -(n.n.n.n) or hex starting with "0x". -TCPNAL: - As the NAL needs to try to send to a particular nid/pid pair, it - will open up connections on demand. Because the port associated with - the connecting socket is different from the bound port, two - connections will normally be established between a pair of peers, with - data flowing from the anonymous connect (active) port to the advertised - or well-known bound (passive) port of each peer. - - Should the connection fail to open, an error is reported to the - library component, which causes the api request to fail. diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c deleted file mode 100644 index f329e2a..0000000 --- a/lustre/portals/unals/address.c +++ /dev/null @@ -1,145 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -/* address.c: - * this file provides functions to aquire the IP address of the node - * and translate them into a NID/PID pair which supports a static - * mapping of virtual nodes into the port range of an IP socket. -*/ - -#include -#include -#include -#include -#include -#include -#include - - -/* Function: get_node_id - * Returns: a 32 bit id for this node, actually a big-endian IP address - * - * get_node_id() determines the host name and uses the resolver to - * find out its ip address. This is fairly fragile and inflexible, but - * explicitly asking about interfaces and their addresses is very - * complicated and nonportable. - */ -static unsigned int get_node_id(void) -{ - char buffer[255]; - unsigned int x; - struct hostent *he; - char * host_envp; - - if (!(host_envp = getenv("PTL_HOSTID"))) - { - gethostname(buffer,sizeof(buffer)); - he=gethostbyname(buffer); - if (he) - x=*(unsigned int *)he->h_addr_list[0]; - else - x = 0; - return(ntohl(x)); - } - else - { - if (host_envp[1] != 'x') - { - int a, b, c, d; - sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d); - return ((a<<24) | (b<<16) | (c<<8) | d); - } - else - { - long long hostid = strtoll(host_envp, 0, 0); - return((unsigned int) hostid); - } - } -} - - -/* Function: set_address - * Arugments: t: a procnal structure to populate with the request - * - * set_address performs the bit manipulations to set the nid, pid, and - * iptop8 fields of the procnal structures. 
- * - * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY - */ - -#ifdef DIRECT_IP_MODE -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int port; - if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; - else port=pidrequest; - t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); - t->lib_nal->libnal_ni.ni_pid.pid=port; -} -#else - -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int virtnode, in_addr, port; - ptl_pid_t pid; - - /* get and remember my node id*/ - if (!getenv("PTL_VIRTNODE")) - virtnode = 0; - else - { - int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT - >> PNAL_VNODE_SHIFT); - virtnode = atoi(getenv("PTL_VIRTNODE")); - if (virtnode > maxvnode) - { - fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", - virtnode, maxvnode); - return; - } - } - - in_addr = get_node_id(); - - t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - pid=pidrequest; - /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ -#ifdef notyet - if (pid==(unsigned short)PTL_PID_ANY) port = 0; -#endif - if (pid==(unsigned short)PTL_PID_ANY) - { - fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); - return; - } - else if (pid > PNAL_PID_MASK) - { - fprintf(stderr, "portal pid of %d is too large - max %d\n", - pid, PNAL_PID_MASK); - return; - } - else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->lib_nal->libnal_ni.ni_pid.pid=pid; -} -#endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h deleted file mode 100644 index d2f0f2c..0000000 --- a/lustre/portals/unals/bridge.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef TCPNAL_PROCBRIDGE_H -#define TCPNAL_PROCBRIDGE_H - -#include -#include - -#define PTL_IFACE_TCP 1 -#define PTL_IFACE_ER 2 -#define PTL_IFACE_SS 3 -#define PTL_IFACE_MAX 4 - -typedef struct bridge { - int alive; - lib_nal_t *lib_nal; - void *lower; - void *local; - void (*shutdown)(struct bridge *); - /* this doesn't really belong here */ - unsigned char iptop8; -} *bridge; - - -typedef int (*nal_initialize)(bridge); -extern nal_initialize nal_table[PTL_IFACE_MAX]; - -#endif diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c deleted file mode 100644 index b399fcf..0000000 --- a/lustre/portals/unals/connection.c +++ /dev/null @@ -1,508 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* connection.c: - This file provides a simple stateful connection manager which - builds tcp connections on demand and leaves them open for - future use. 
It also provides the machinery to allow peers - to connect to it -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif - -/* global variable: acceptor port */ -unsigned short tcpnal_acceptor_port = 988; - - -/* Function: compare_connection - * Arguments: connection c: a connection in the hash table - * ptl_process_id_t: an id to verify agains - * Returns: 1 if the connection is the one requested, 0 otherwise - * - * compare_connection() tests for collisions in the hash table - */ -static int compare_connection(void *arg1, void *arg2) -{ - connection c = arg1; - unsigned int * id = arg2; -#if 0 - return((c->ip==id[0]) && (c->port==id[1])); -#else - /* CFS specific hacking */ - return (c->ip == id[0]); -#endif -} - - -/* Function: connection_key - * Arguments: ptl_process_id_t id: an id to hash - * Returns: a not-particularily-well-distributed hash - * of the id - */ -static unsigned int connection_key(unsigned int *id) -{ -#if 0 - return(id[0]^id[1]); -#else - /* CFS specific hacking */ - return (unsigned int) id[0]; -#endif -} - - -/* Function: remove_connection - * Arguments: c: the connection to remove - */ -void remove_connection(void *arg) -{ - connection c = arg; - unsigned int id[2]; - - id[0]=c->ip; - id[1]=c->port; - hash_table_remove(c->m->connections,id); - close(c->fd); - free(c); -} - - -/* Function: read_connection: - * Arguments: c: the connection to read from - * dest: the buffer to read into - * len: the number of bytes to read - * Returns: success as 1, or failure as 0 - * - * read_connection() reads data from the connection, continuing - * to read partial results until the request is satisfied or - * it errors. TODO: this read should be covered by signal protection. 
- */ -int read_connection(connection c, - unsigned char *dest, - int len) -{ - int offset = 0,rc; - - if (len) { - do { -#ifndef __CYGWIN__ - rc = syscall(SYS_read, c->fd, dest+offset, len-offset); -#else - rc = recv(c->fd, dest+offset, len-offset, 0); -#endif - if (rc <= 0) { - if (errno == EINTR) { - rc = 0; - } else { - remove_connection(c); - return (0); - } - } - offset += rc; - } while (offset < len); - } - return (1); -} - -static int connection_input(void *d) -{ - connection c = d; - return((*c->m->handler)(c->m->handler_arg,c)); -} - - -/* Function: allocate_connection - * Arguments: t: tcpnal the allocation is occuring in the context of - * dest: portal endpoint address for this connection - * fd: open file descriptor for the socket - * Returns: an allocated connection structure - * - * just encompasses the action common to active and passive - * connections of allocation and placement in the global table - */ -static connection allocate_connection(manager m, - unsigned int ip, - unsigned short port, - int fd) -{ - connection c=malloc(sizeof(struct connection)); - unsigned int id[2]; - c->m=m; - c->fd=fd; - c->ip=ip; - c->port=port; - id[0]=ip; - id[1]=port; - register_io_handler(fd,READ_HANDLER,connection_input,c); - hash_table_insert(m->connections,c,id); - return(c); -} - - -/* Function: new_connection - * Arguments: t: opaque argument holding the tcpname - * Returns: 1 in order to reregister for new connection requests - * - * called when the bound service socket recieves - * a new connection request, it always accepts and - * installs a new connection - */ -static int new_connection(void *z) -{ - manager m=z; - struct sockaddr_in s; - int len=sizeof(struct sockaddr_in); - int fd=accept(m->bound,(struct sockaddr *)&s,&len); - unsigned int nid=*((unsigned int *)&s.sin_addr); - /* cfs specific hack */ - //unsigned short pid=s.sin_port; - pthread_mutex_lock(&m->conn_lock); - allocate_connection(m,htonl(nid),0/*pid*/,fd); - 
pthread_mutex_unlock(&m->conn_lock); - return(1); -} - -extern ptl_nid_t tcpnal_mynid; - -int -tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) -{ - int rc; - int nob; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; - - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - - memset (&hdr, 0, sizeof (hdr)); - hmv->magic = cpu_to_le32(PORTALS_PROTO_MAGIC); - hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR); - - hdr.src_nid = cpu_to_le64(tcpnal_mynid); - hdr.type = cpu_to_le32(PTL_MSG_HELLO); - - hdr.msg.hello.type = cpu_to_le32(type); - hdr.msg.hello.incarnation = cpu_to_le64(incarnation); - - /* I don't send any interface info */ - - /* Assume sufficient socket buffering for this message */ - rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); - if (rc <= 0) { - CERROR ("Error %d sending HELLO to "LPX64"\n", rc, *nid); - return (rc); - } - - rc = syscall(SYS_read, sockfd, hmv, sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading HELLO from "LPX64"\n", rc, *nid); - return (rc); - } - - if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", - cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid); - return (-EPROTO); - } - - if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { - CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from "LPX64"\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - PORTALS_PROTO_VERSION_MAJOR, - PORTALS_PROTO_VERSION_MINOR, - *nid); - return (-EPROTO); - } - -#if (PORTALS_PROTO_VERSION_MAJOR != 1) -# error "This code only understands protocol version 1.x" -#endif - /* version 1 sends magic/version as the dest_nid of a 'hello' header, - * so read the rest of it in now... 
*/ - - rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n", - rc, *nid); - return (rc); - } - - /* ...and check we got what we expected */ - if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { - CERROR ("Expecting a HELLO hdr " - " but got type %d with %d payload from "LPX64"\n", - le32_to_cpu (hdr.type), - le32_to_cpu (hdr.payload_length), *nid); - return (-EPROTO); - } - - if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n"); - return (-EPROTO); - } - - if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = le64_to_cpu(hdr.src_nid); - } else if (*nid != le64_to_cpu (hdr.src_nid)) { - CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", - le64_to_cpu (hdr.src_nid), *nid); - return (-EPROTO); - } - - /* Ignore any interface info in the payload */ - nob = le32_to_cpu(hdr.payload_length); - if (nob > getpagesize()) { - CERROR("Unexpected HELLO payload %d from "LPX64"\n", - nob, *nid); - return (-EPROTO); - } - if (nob > 0) { - char *space = (char *)malloc(nob); - - if (space == NULL) { - CERROR("Can't allocate scratch buffer %d\n", nob); - return (-ENOMEM); - } - - rc = syscall(SYS_read, sockfd, space, nob); - if (rc <= 0) { - CERROR("Error %d skipping HELLO payload from " - LPX64"\n", rc, *nid); - return (rc); - } - } - - return (0); -} - -/* Function: force_tcp_connection - * Arguments: t: tcpnal - * dest: portals endpoint for the connection - * Returns: an allocated connection structure, either - * a pre-existing one, or a new connection - */ -connection force_tcp_connection(manager m, - unsigned int ip, - unsigned short port, - procbridge pb) -{ - connection conn; - struct sockaddr_in addr; - struct sockaddr_in locaddr; - unsigned int id[2]; - struct timeval tv; - __u64 incarnation; - - int fd; - int option; - int rc; - int rport; - ptl_nid_t peernid = PTL_NID_ANY; - - port = 
tcpnal_acceptor_port; - - id[0] = ip; - id[1] = port; - - pthread_mutex_lock(&m->conn_lock); - - conn = hash_table_find(m->connections, id); - if (conn) - goto out; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_addr.s_addr = INADDR_ANY; - - for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - perror("tcpnal socket failed"); - goto out; - } - - option = 1; - rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &option, sizeof(option)); - if (rc != 0) { - perror ("Can't set SO_REUSEADDR for socket"); - close(fd); - goto out; - } - - locaddr.sin_port = htons(rport); - rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == 0 || errno == EACCES) { - rc = connect(fd, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in)); - if (rc == 0) { - break; - } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { - perror("Error connecting to remote host"); - close(fd); - goto out; - } - } else if (errno != EADDRINUSE) { - perror("Error binding to privileged port"); - close(fd); - goto out; - } - close(fd); - } - - if (rport == IPPORT_RESERVED / 2) { - fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); - goto out; - } - -#if 1 - option = 1; - setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); -#endif - - gettimeofday(&tv, NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) - exit(-1); - - conn = allocate_connection(m, ip, port, fd); - - /* let nal thread know this event right away */ - if (conn) - 
procbridge_wakeup_nal(pb); - -out: - pthread_mutex_unlock(&m->conn_lock); - return (conn); -} - - -/* Function: bind_socket - * Arguments: t: the nal state for this interface - * port: the port to attempt to bind to - * Returns: 1 on success, or 0 on error - * - * bind_socket() attempts to allocate and bind a socket to the requested - * port, or dynamically assign one from the kernel should the port be - * zero. Sets the bound and bound_handler elements of m. - * - * TODO: The port should be an explicitly sized type. - */ -static int bind_socket(manager m,unsigned short port) -{ - struct sockaddr_in addr; - int alen=sizeof(struct sockaddr_in); - - if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) - return(0); - - bzero((char *) &addr, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = 0; - addr.sin_port = htons(port); - - if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ - perror ("tcpnal bind"); - return(0); - } - - getsockname(m->bound,(struct sockaddr *)&addr, &alen); - - m->bound_handler=register_io_handler(m->bound,READ_HANDLER, - new_connection,m); - listen(m->bound,5); - m->port=addr.sin_port; - return(1); -} - - -/* Function: shutdown_connections - * Arguments: m: the manager structure - * - * close all connections and reclaim resources - */ -void shutdown_connections(manager m) -{ - close(m->bound); - remove_io_handler(m->bound_handler); - hash_destroy_table(m->connections,remove_connection); - free(m); -} - - -/* Function: init_connections - * Arguments: t: the nal state for this interface - * port: the port to attempt to bind to - * Returns: a newly allocated manager structure, or - * zero if the fixed port could not be bound - */ -manager init_connections(unsigned short pid, - int (*input)(void *, void *), - void *a) -{ - manager m = (manager)malloc(sizeof(struct manager)); - m->connections = hash_create_table(compare_connection,connection_key); - m->handler = input; - m->handler_arg = a; - pthread_mutex_init(&m->conn_lock, 0); 
- - if (bind_socket(m,pid)) - return(m); - - free(m); - return(0); -} diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h deleted file mode 100644 index 343ffa6..0000000 --- a/lustre/portals/unals/connection.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#include -#include - -typedef struct manager { - table connections; - pthread_mutex_t conn_lock; /* protect connections table */ - int bound; - io_handler bound_handler; - int (*handler)(void *, void *); - void *handler_arg; - unsigned short port; -} *manager; - - -typedef struct connection { - unsigned int ip; - unsigned short port; - int fd; - manager m; -} *connection; - -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, - procbridge pb); -manager init_connections(unsigned short, int (*f)(void *, void *), void *); -void remove_connection(void *arg); -void shutdown_connections(manager m); -int read_connection(connection c, unsigned char *dest, int len); diff --git a/lustre/portals/unals/debug.c b/lustre/portals/unals/debug.c deleted file mode 100644 index b82bb2f..0000000 --- a/lustre/portals/unals/debug.c +++ /dev/null @@ -1,119 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -int smp_processor_id = 1; -char debug_file_path[1024] = "/tmp/lustre-log"; -char debug_file_name[1024]; -FILE *debug_file_fd; - -int portals_do_debug_dumplog(void *arg) -{ - printf("Look in %s\n", debug_file_name); - return 0; -} - - -void portals_debug_print(void) -{ - return; -} - - -void portals_debug_dumplog(void) -{ - printf("Look in %s\n", debug_file_name); - return; -} - - -int portals_debug_init(unsigned long bufsize) -{ - debug_file_fd = stdout; - return 0; -} - -int portals_debug_cleanup(void) -{ - return 0; //close(portals_debug_fd); -} - -int portals_debug_clear_buffer(void) -{ - return 0; -} - -int portals_debug_mark_buffer(char *text) -{ - - fprintf(debug_file_fd, "*******************************************************************************\n"); - fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); - fprintf(debug_file_fd, "*******************************************************************************\n"); - - return 0; -} - -int portals_debug_copy_to_user(char *buf, unsigned long len) -{ - return 0; -} - -/* FIXME: I'm not very smart; someone smarter should make this better. */ -void -portals_debug_msg (int subsys, int mask, char *file, const char *fn, - const int line, const char *format, ...) 
-{ - va_list ap; - unsigned long flags; - struct timeval tv; - int nob; - - - /* NB since we pass a non-zero sized buffer (at least) on the first - * print, we can be assured that by the end of all the snprinting, - * we _do_ have a terminated buffer, even if our message got truncated. - */ - - gettimeofday(&tv, NULL); - - nob += fprintf(debug_file_fd, - "%02x:%06x:%d:%lu.%06lu ", - subsys >> 24, mask, smp_processor_id, - tv.tv_sec, tv.tv_usec); - - nob += fprintf(debug_file_fd, - "(%s:%d:%s() %d+%ld): ", - file, line, fn, 0, - 8192 - ((unsigned long)&flags & 8191UL)); - - va_start (ap, format); - nob += fprintf(debug_file_fd, format, ap); - va_end (ap); - - -} - diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h deleted file mode 100644 index a8f916d9..0000000 --- a/lustre/portals/unals/dispatch.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -/* this file is only called dispatch.h to prevent it - from colliding with /usr/include/sys/select.h */ - -typedef struct io_handler *io_handler; - -struct io_handler{ - io_handler *last; - io_handler next; - int fd; - int type; - int (*function)(void *); - void *argument; - int disabled; -}; - - -#define READ_HANDLER 1 -#define WRITE_HANDLER 2 -#define EXCEPTION_HANDLER 4 -#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) - -io_handler register_io_handler(int fd, - int type, - int (*function)(void *), - void *arg); - -void remove_io_handler (io_handler i); -void init_unix_timer(void); -void select_timer_block(when until); -when now(void); - -/* - * hacking for CFS internal MPI testing - */ -#if !CRAY_PORTALS -#define ENABLE_SELECT_DISPATCH -#endif diff --git a/lustre/portals/unals/ipmap.h b/lustre/portals/unals/ipmap.h deleted file mode 100644 index 85b1e18..0000000 --- a/lustre/portals/unals/ipmap.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#define DIRECT_IP_MODE -#ifdef DIRECT_IP_MODE -#define PNAL_NID(in_addr, port) (in_addr) -#define PNAL_PID(pid) (pid) -#define PNAL_IP(in_addr, port) (in_addr) -#define PNAL_PORT(nid, pid) (pid) -#else - -#define PNAL_BASE_PORT 4096 -#define PNAL_HOSTID_SHIFT 24 -#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) -#define PNAL_VNODE_SHIFT 8 -#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) -#define PNAL_PID_SHIFT 8 -#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) - -#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ - << PNAL_VNODE_SHIFT) \ - | (((ntohs(port)-PNAL_BASE_PORT) >>\ - PNAL_PID_SHIFT))) -#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) - -#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ - >> PNAL_VNODE_SHIFT)\ - | (t->iptop8 << PNAL_HOSTID_SHIFT))) -#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ - << PNAL_VNODE_SHIFT) \ - | ((pid) & PNAL_PID_MASK)) \ - + PNAL_BASE_PORT)) -#endif diff --git a/lustre/portals/unals/pqtimer.c b/lustre/portals/unals/pqtimer.c deleted file mode 100644 index 98c48eb..0000000 --- a/lustre/portals/unals/pqtimer.c +++ /dev/null @@ -1,226 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* timer.c: - * this file implements a simple priority-queue based timer system. when - * combined with a file which implements now() and block(), it can - * be used to provide course-grained time-based callbacks. - */ - -#include -#include -#include - -struct timer { - void (*function)(void *); - void *arg; - when w; - int interval; - int disable; -}; - -typedef struct thunk *thunk; -struct thunk { - void (*f)(void *); - void *a; - thunk next; -}; - -extern when now(void); - -static thunk thunks; -static int internal; -static void (*block_function)(when); -static int number_of_timers; -static int size_of_pqueue; -static timer *timers; - - -static void heal(int where) -{ - int left=(where<<1); - int right=(where<<1)+1; - int min=where; - timer temp; - - if (left <= number_of_timers) - if (timers[left]->w < timers[min]->w) min=left; - if (right <= number_of_timers) - if (timers[right]->w < timers[min]->w) min=right; - if (min != where){ - temp=timers[where]; - timers[where]=timers[min]; - timers[min]=temp; - heal(min); - } -} - -static void add_pqueue(int i) -{ - timer temp; - int parent=(i>>1); - if ((i>1) && (timers[i]->w< timers[parent]->w)){ - temp=timers[i]; - timers[i]=timers[parent]; - timers[parent]=temp; - add_pqueue(parent); - } -} - -static void add_timer(timer t) -{ - if (size_of_pqueue<(number_of_timers+2)){ - int oldsize=size_of_pqueue; - timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10)); - memcpy(new,timers,sizeof(timer)*oldsize); - timers=new; - } - timers[++number_of_timers]=t; - add_pqueue(number_of_timers); -} - -/* Function: register_timer - * Arguments: interval: the time interval from the current time when - * the timer function should be called - * function: the function to call when the time has expired - * argument: the 
argument to call it with. - * Returns: a pointer to a timer structure - */ -timer register_timer(when interval, - void (*function)(void *), - void *argument) -{ - timer t=(timer)malloc(sizeof(struct timer)); - - t->arg=argument; - t->function=function; - t->interval=interval; - t->disable=0; - t->w=now()+interval; - add_timer(t); - if (!internal && (number_of_timers==1)) - block_function(t->w); - return(t); -} - -/* Function: remove_timer - * Arguments: t: - * Returns: nothing - * - * remove_timer removes a timer from the system, insuring - * that it will never be called. It does not actually - * free the timer due to reentrancy issues. - */ - -void remove_timer(timer t) -{ - t->disable=1; -} - - - -void timer_fire() -{ - timer current; - - current=timers[1]; - timers[1]=timers[number_of_timers--]; - heal(1); - if (!current->disable) { - (*current->function)(current->arg); - } - free(current); -} - -when next_timer(void) -{ - when here=now(); - - while (number_of_timers && (timers[1]->w <= here)) timer_fire(); - if (number_of_timers) return(timers[1]->w); - return(0); -} - -/* Function: timer_loop - * Arguments: none - * Returns: never - * - * timer_loop() is the blocking dispatch function for the timer. - * Is calls the block() function registered with init_timer, - * and handles associated with timers that have been registered. - */ -void timer_loop() -{ - when here; - - while (1){ - thunk z; - here=now(); - - for (z=thunks;z;z=z->next) (*z->f)(z->a); - - if (number_of_timers){ - if (timers[1]->w > here){ - (*block_function)(timers[1]->w); - } else { - timer_fire(); - } - } else { - thunk z; - for (z=thunks;z;z=z->next) (*z->f)(z->a); - (*block_function)(0); - } - } -} - - -/* Function: register_thunk - * Arguments: f: the function to call - * a: the single argument to call it with - * - * Thunk functions get called at irregular intervals, they - * should not assume when, or take a particularily long - * amount of time. Thunks are for background cleanup tasks. 
- */ -void register_thunk(void (*f)(void *),void *a) -{ - thunk t=(void *)malloc(sizeof(struct thunk)); - t->f=f; - t->a=a; - t->next=thunks; - thunks=t; -} - -/* Function: initialize_timer - * Arguments: block: the function to call to block for the specified interval - * - * initialize_timer() must be called before any other timer function, - * including timer_loop. - */ -void initialize_timer(void (*block)(when)) -{ - block_function=block; - number_of_timers=0; - size_of_pqueue=10; - timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); - thunks=0; -} diff --git a/lustre/portals/unals/pqtimer.h b/lustre/portals/unals/pqtimer.h deleted file mode 100644 index 11efb0e..0000000 --- a/lustre/portals/unals/pqtimer.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -typedef unsigned long long when; -when now(void); -typedef struct timer *timer; -timer register_timer(when interval, - void (*function)(void *), - void *argument); -timer register_timer_wait(void); -void remove_timer(timer); -void timer_loop(void); -void initialize_timer(void (*block)(when)); -void timer_fire(void); - - -#define HZ 0x100000000ull - - diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c deleted file mode 100644 index 6b471c0..0000000 --- a/lustre/portals/unals/procapi.c +++ /dev/null @@ -1,196 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* api.c: - * This file provides the 'api' side for the process-based nals. - * it is responsible for creating the 'library' side thread, - * and passing wrapped portals transactions to it. - * - * Along with initialization, shutdown, and transport to the library - * side, this file contains some stubs to satisfy the nal definition. - */ -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif -#include -#include -#include -#include -#include - - -/* XXX CFS workaround, to give a chance to let nal thread wake up - * from waiting in select - */ -static int procbridge_notifier_handler(void *arg) -{ - static char buf[8]; - procbridge p = (procbridge) arg; - - syscall(SYS_read, p->notifier[1], buf, sizeof(buf)); - return 1; -} - -void procbridge_wakeup_nal(procbridge p) -{ - static char buf[8]; - syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); -} - -/* Function: shutdown - * Arguments: nal: a pointer to my top side nal structure - * ni: my network interface index - * - * cleanup nal state, reclaim the lower side thread and - * its state using PTL_FINI codepoint - */ -static void procbridge_shutdown(nal_t *n) -{ - lib_nal_t *nal = n->nal_data; - bridge b=(bridge)nal->libnal_data; - procbridge p=(procbridge)b->local; - - p->nal_flags |= NAL_FLAG_STOPPING; - procbridge_wakeup_nal(p); - - do { - 
pthread_mutex_lock(&p->mutex); - if (p->nal_flags & NAL_FLAG_STOPPED) { - pthread_mutex_unlock(&p->mutex); - break; - } - pthread_cond_wait(&p->cond, &p->mutex); - pthread_mutex_unlock(&p->mutex); - } while (1); - - free(p); -} - - -/* forward decl */ -extern int procbridge_startup (nal_t *, ptl_pid_t, - ptl_ni_limits_t *, ptl_ni_limits_t *); - -/* api_nal - * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side lib_nal. - * TODO: should be dyanmically allocated - */ -nal_t procapi_nal = { - nal_data: NULL, - nal_ni_init: procbridge_startup, - nal_ni_fini: procbridge_shutdown, -}; - -ptl_nid_t tcpnal_mynid; - -#ifdef ENABLE_SELECT_DISPATCH -procbridge __global_procbridge = NULL; -#endif - -/* Function: procbridge_startup - * - * Arguments: pid: requested process id (port offset) - * PTL_ID_ANY not supported. - * desired: limits passed from the application - * and effectively ignored - * actual: limits actually allocated and returned - * - * Returns: portals rc - * - * initializes the tcp nal. we define unix_failure as an - * error wrapper to cut down clutter. 
- */ -int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - nal_init_args_t args; - - procbridge p; - bridge b; - /* XXX nal_type is purely private to tcpnal here */ - int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ - - LASSERT(nal == &procapi_nal); - - init_unix_timer(); - - b=(bridge)malloc(sizeof(struct bridge)); - p=(procbridge)malloc(sizeof(struct procbridge)); - b->local=p; - - args.nia_requested_pid = requested_pid; - args.nia_requested_limits = requested_limits; - args.nia_actual_limits = actual_limits; - args.nia_nal_type = nal_type; - args.nia_bridge = b; - args.nia_apinal = nal; - - /* init procbridge */ - pthread_mutex_init(&p->mutex,0); - pthread_cond_init(&p->cond, 0); - p->nal_flags = 0; - - /* initialize notifier */ - if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { - perror("socketpair failed"); - return PTL_FAIL; - } - - if (!register_io_handler(p->notifier[1], READ_HANDLER, - procbridge_notifier_handler, p)) { - perror("fail to register notifier handler"); - return PTL_FAIL; - } - -#ifdef ENABLE_SELECT_DISPATCH - __global_procbridge = p; -#endif - - /* create nal thread */ - if (pthread_create(&p->t, NULL, nal_thread, &args)) { - perror("nal_init: pthread_create"); - return PTL_FAIL; - } - - do { - pthread_mutex_lock(&p->mutex); - if (p->nal_flags & (NAL_FLAG_RUNNING | NAL_FLAG_STOPPED)) { - pthread_mutex_unlock(&p->mutex); - break; - } - pthread_cond_wait(&p->cond, &p->mutex); - pthread_mutex_unlock(&p->mutex); - } while (1); - - if (p->nal_flags & NAL_FLAG_STOPPED) - return PTL_FAIL; - - b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; - - return PTL_OK; -} diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h deleted file mode 100644 index 1f91ced..0000000 --- a/lustre/portals/unals/procbridge.h +++ /dev/null @@ -1,56 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * 
vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef _PROCBRIDGE_H_ -#define _PROCBRIDGE_H_ - -#include -#include -#include - - -#define NAL_FLAG_RUNNING 1 -#define NAL_FLAG_STOPPING 2 -#define NAL_FLAG_STOPPED 4 - -typedef struct procbridge { - /* sync between user threads and nal thread */ - pthread_t t; - pthread_cond_t cond; - pthread_mutex_t mutex; - - /* socket pair used to notify nal thread */ - int notifier[2]; - - int nal_flags; - -} *procbridge; - -typedef struct nal_init_args { - ptl_pid_t nia_requested_pid; - ptl_ni_limits_t *nia_requested_limits; - ptl_ni_limits_t *nia_actual_limits; - int nia_nal_type; - bridge nia_bridge; - nal_t *nia_apinal; -} nal_init_args_t; - -extern void *nal_thread(void *); - - -#define PTL_INIT (LIB_MAX_DISPATCH+1) -#define PTL_FINI (LIB_MAX_DISPATCH+2) - -#define MAX_ACLS 1 -#define MAX_PTLS 128 - -extern void set_address(bridge t,ptl_pid_t pidrequest); -extern void procbridge_wakeup_nal(procbridge p); - -#endif diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c deleted file mode 100644 index 7ee7c71..0000000 --- a/lustre/portals/unals/proclib.c +++ /dev/null @@ -1,137 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* lib.c: - * This file provides the 'library' side for the process-based nals. - * it is responsible for communication with the 'api' side and - * providing service to the generic portals 'library' - * implementation. 'library' might be better termed 'communication' - * or 'kernel'. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* the following functions are stubs to satisfy the nal definition - without doing anything particularily useful*/ - -static int nal_dist(lib_nal_t *nal, - ptl_nid_t nid, - unsigned long *dist) -{ - return 0; -} - -static void check_stopping(void *z) -{ - bridge b = z; - procbridge p = b->local; - - if ((p->nal_flags & NAL_FLAG_STOPPING) == 0) - return; - - pthread_mutex_lock(&p->mutex); - p->nal_flags |= NAL_FLAG_STOPPED; - pthread_cond_broadcast(&p->cond); - pthread_mutex_unlock(&p->mutex); - - pthread_exit(0); -} - - -/* Function: nal_thread - * Arguments: z: an opaque reference to a nal control structure - * allocated and partially populated by the api level code - * Returns: nothing, and only on error or explicit shutdown - * - * This function is the entry point of the pthread initiated on - * the api side of the interface. This thread is used to handle - * asynchronous delivery to the application. 
- * - * We define a limit macro to place a ceiling on limits - * for syntactic convenience - */ -extern int tcpnal_init(bridge); - -nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; - -void *nal_thread(void *z) -{ - nal_init_args_t *args = (nal_init_args_t *) z; - bridge b = args->nia_bridge; - procbridge p=b->local; - int rc; - ptl_process_id_t process_id; - int nal_type; - - b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); - b->lib_nal->libnal_data=b; - b->lib_nal->libnal_map=NULL; - b->lib_nal->libnal_unmap=NULL; - b->lib_nal->libnal_dist=nal_dist; - - nal_type = args->nia_nal_type; - - /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which - * lib_init() is about to do from the process_id passed to it...*/ - set_address(b,args->nia_requested_pid); - - process_id = b->lib_nal->libnal_ni.ni_pid; - - if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); - /* initialize the generic 'library' level code */ - - rc = lib_init(b->lib_nal, args->nia_apinal, - process_id, - args->nia_requested_limits, - args->nia_actual_limits); - - /* - * Whatever the initialization returned is passed back to the - * user level code for further interpretation. We just exit if - * it is non-zero since something went wrong. - */ - /* this should perform error checking */ - pthread_mutex_lock(&p->mutex); - p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; - pthread_cond_broadcast(&p->cond); - pthread_mutex_unlock(&p->mutex); - - if (rc == PTL_OK) { - /* the thunk function is called each time the timer loop - performs an operation and returns to blocking mode. 
we - overload this function to inform the api side that - it may be interested in looking at the event queue */ - register_thunk(check_stopping,b); - timer_loop(); - } - return(0); -} diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c deleted file mode 100644 index 09e1542..0000000 --- a/lustre/portals/unals/select.c +++ /dev/null @@ -1,419 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* select.c: - * Provides a general mechanism for registering and dispatching - * io events through the select system call. - */ - -#ifdef sun -#include -#else -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static struct timeval beginning_of_epoch; -static io_handler io_handlers; - -/* Function: now - * - * Return: the current time in canonical units: a 64 bit number - * where the most significant 32 bits contains the number - * of seconds, and the least signficant a count of (1/(2^32))ths - * of a second. 
- */ -when now() -{ - struct timeval result; - - gettimeofday(&result,0); - return((((unsigned long long)result.tv_sec)<<32)| - (((unsigned long long)result.tv_usec)<<32)/1000000); -} - - -/* Function: register_io_handler - * Arguments: fd: the file descriptor of interest - * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER - * function: a function to call when io is available on fd - * arg: an opaque correlator to return to the handler - * Returns: a pointer to the io_handler structure - */ -io_handler register_io_handler(int fd, - int type, - int (*function)(void *), - void *arg) -{ - io_handler i=(io_handler)malloc(sizeof(struct io_handler)); - if ((i->fd=fd)>=0){ - i->type=type; - i->function=function; - i->argument=arg; - i->disabled=0; - i->last=&io_handlers; - if ((i->next=io_handlers)) i->next->last=&i->next; - io_handlers=i; - } - return(i); -} - -/* Function: remove_io_handler - * Arguments: i: a pointer to the handler to stop servicing - * - * remove_io_handler() doesn't actually free the handler, due - * to reentrancy problems. it just marks the handler for - * later cleanup by the blocking function. 
- */ -void remove_io_handler (io_handler i) -{ - i->disabled=1; -} - -static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) -{ - if (n->type & READ_HANDLER) FD_SET(n->fd, r); - if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); -} - -static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) -{ - io_handler j; - io_handler *k; - int max = 0; - - FD_ZERO(r); - FD_ZERO(w); - FD_ZERO(e); - for (k=&io_handlers;*k;){ - if ((*k)->disabled){ - j=*k; - *k=(*k)->next; - free(j); - } - if (*k) { - set_flag(*k,r,w,e); - if ((*k)->fd > max) - max = (*k)->fd; - k=&(*k)->next; - } - } - return max + 1; -} - -static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) -{ - io_handler j; - int n = 0, t; - - for (j = io_handlers; j; j = j->next) { - if (j->disabled) - continue; - - t = 0; - if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { - FD_CLR(j->fd, r); - t++; - } - if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { - FD_CLR(j->fd, w); - t++; - } - if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { - FD_CLR(j->fd, e); - t++; - } - if (t == 0) - continue; - - if (!(*j->function)(j->argument)) - j->disabled = 1; - - n += t; - } - - return n; -} - -#ifdef ENABLE_SELECT_DISPATCH - -static struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int submitted; - int nready; - int maxfd; - fd_set *rset; - fd_set *wset; - fd_set *eset; - struct timeval *timeout; - struct timeval submit_time; -} fd_extra = { - PTHREAD_MUTEX_INITIALIZER, - PTHREAD_COND_INITIALIZER, - 0, 0, 0, - NULL, NULL, NULL, NULL, -}; - -extern int liblustre_wait_event(int timeout); -extern procbridge __global_procbridge; - -/* - * this will intercept syscall select() of user apps - * such as MPI libs. 
- */ -int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, - struct timeval *timeout) -{ - LASSERT(fd_extra.submitted == 0); - - fd_extra.nready = 0; - fd_extra.maxfd = n; - fd_extra.rset = rset; - fd_extra.wset = wset; - fd_extra.eset = eset; - fd_extra.timeout = timeout; - - liblustre_wait_event(0); - pthread_mutex_lock(&fd_extra.mutex); - gettimeofday(&fd_extra.submit_time, NULL); - fd_extra.submitted = 1; - LASSERT(__global_procbridge); - procbridge_wakeup_nal(__global_procbridge); - -again: - if (fd_extra.submitted) - pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); - pthread_mutex_unlock(&fd_extra.mutex); - - liblustre_wait_event(0); - - pthread_mutex_lock(&fd_extra.mutex); - if (fd_extra.submitted) - goto again; - pthread_mutex_unlock(&fd_extra.mutex); - - LASSERT(fd_extra.nready >= 0); - LASSERT(fd_extra.submitted == 0); - return fd_extra.nready; -} - -static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) -{ - int i; - - LASSERT(rset); - LASSERT(wset); - LASSERT(eset); - - for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { - LASSERT(!fd_extra.rset || - !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); - LASSERT(!fd_extra.wset || - !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); - LASSERT(!fd_extra.eset || - !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); - - if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) - __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; - if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) - __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; - if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) - __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; - } - - return (fd_extra.maxfd > max ? 
fd_extra.maxfd : max); -} - -static inline -int timeval_ge(struct timeval *tv1, struct timeval *tv2) -{ - LASSERT(tv1 && tv2); - return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + - (tv1->tv_usec - tv2->tv_usec) >= 0); -} - -/* - * choose the most recent timeout value - */ -static struct timeval *choose_timeout(struct timeval *tv1, - struct timeval *tv2) -{ - if (!tv1) - return tv2; - else if (!tv2) - return tv1; - - if (timeval_ge(tv1, tv2)) - return tv2; - else - return tv1; -} - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. - */ -void select_timer_block(when until) -{ - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer, *select_timeout; - int max, nready, nexec; - int fd_handling; - -again: - if (until) { - when interval; - - interval = until - now(); - timeout.tv_sec = (interval >> 32); - timeout.tv_usec = ((interval << 32) / 1000000) >> 32; - timeout_pointer = &timeout; - } else - timeout_pointer = NULL; - - fd_handling = 0; - max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); - select_timeout = timeout_pointer; - - pthread_mutex_lock(&fd_extra.mutex); - fd_handling = fd_extra.submitted; - pthread_mutex_unlock(&fd_extra.mutex); - if (fd_handling) { - max = merge_fds(max, &fds[0], &fds[1], &fds[2]); - select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); - } - - /* XXX only compile for linux */ -#if __WORDSIZE == 64 - nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], - select_timeout); -#else - nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], - select_timeout); -#endif - if (nready < 0) { - CERROR("select return err %d, errno %d\n", nready, errno); - return; - } - - if (nready) { - nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); - nready -= nexec; - } else - nexec = 0; - - /* even both nready & nexec are 0, 
we still need try to wakeup - * upper thread since it may have timed out - */ - if (fd_handling) { - LASSERT(nready >= 0); - - pthread_mutex_lock(&fd_extra.mutex); - if (nready) { - if (fd_extra.rset) - *fd_extra.rset = fds[0]; - if (fd_extra.wset) - *fd_extra.wset = fds[1]; - if (fd_extra.eset) - *fd_extra.eset = fds[2]; - fd_extra.nready = nready; - fd_extra.submitted = 0; - } else { - struct timeval t; - - fd_extra.nready = 0; - if (fd_extra.timeout) { - gettimeofday(&t, NULL); - if (timeval_ge(&t, &fd_extra.submit_time)) - fd_extra.submitted = 0; - } - } - - pthread_cond_signal(&fd_extra.cond); - pthread_mutex_unlock(&fd_extra.mutex); - } - - /* haven't found portals event, go back to loop if time - * is not expired */ - if (!nexec) { - if (timeout_pointer == NULL || now() >= until) - goto again; - } -} - -#else /* !ENABLE_SELECT_DISPATCH */ - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
- */ -void select_timer_block(when until) -{ - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int max, nready; - -again: - if (until) { - when interval; - interval = until - now(); - timeout.tv_sec = (interval >> 32); - timeout.tv_usec = ((interval << 32) / 1000000) >> 32; - timeout_pointer = &timeout; - } else - timeout_pointer = NULL; - - max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); - - nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); - if (nready > 0) - execute_callbacks(&fds[0], &fds[1], &fds[2]); -} -#endif /* ENABLE_SELECT_DISPATCH */ - -/* Function: init_unix_timer() - * is called to initialize the library - */ -void init_unix_timer() -{ - io_handlers=0; - gettimeofday(&beginning_of_epoch, 0); - initialize_timer(select_timer_block); -} diff --git a/lustre/portals/unals/table.c b/lustre/portals/unals/table.c deleted file mode 100644 index 662775a..0000000 --- a/lustre/portals/unals/table.c +++ /dev/null @@ -1,264 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include -#include -#include - - -/* table.c: - * a very simple hash table implementation with paramerterizable - * comparison and key generation functions. it does resize - * in order to accomidate more entries, but never collapses - * the table - */ - -static table_entry *table_lookup (table t,void *comparator, - unsigned int k, - int (*compare_function)(void *, void *), - int *success) -{ - unsigned int key=k%t->size; - table_entry *i; - - for (i=&(t->entries[key]);*i;i=&((*i)->next)){ - if (compare_function && ((*i)->key==k)) - if ((*t->compare_function)((*i)->value,comparator)){ - *success=1; - return(i); - } - } - *success=0; - return(&(t->entries[key])); -} - - -static void resize_table(table t, int size) -{ - int old_size=t->size; - table_entry *old_entries=t->entries; - int i; - table_entry j,n; - table_entry *position; - int success; - - t->size=size; - t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); - memset(t->entries,0,sizeof(table_entry)*t->size); - - for (i=0;inext; - position=table_lookup(t,0,j->key,0,&success); - j->next= *position; - *position=j; - } - free(old_entries); -} - - -/* Function: key_from_int - * Arguments: int i: value to compute the key of - * Returns: the key - */ -unsigned int key_from_int(int i) -{ - return(i); -} - - -/* Function: key_from_string - * Arguments: char *s: the null terminated string - * to compute the key of - * Returns: the key - */ -unsigned int key_from_string(char *s) -{ - unsigned int result=0; - unsigned char *n; - int i; - if (!s) return(1); - for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; - return(result); -} - - -/* Function: hash_create_table - * Arguments: compare_function: a function to compare - * a table instance with a correlator - * key_function: a function to generate a 32 bit - * hash key from a correlator - * Returns: a pointer to the new table - */ -table hash_create_table (int (*compare_function)(void *, void *), - unsigned int (*key_function)(unsigned int *)) -{ - table 
new=(table)malloc(sizeof(struct table)); - memset(new, 0, sizeof(struct table)); - - new->compare_function=compare_function; - new->key_function=key_function; - new->number_of_entries=0; - new->size=4; - new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); - memset(new->entries,0,sizeof(table_entry)*new->size); - return(new); -} - - -/* Function: hash_table_find - * Arguments: t: a table to look in - * comparator: a value to access the table entry - * Returns: the element references to by comparator, or null - */ -void *hash_table_find (table t, void *comparator) -{ - int success; - table_entry* entry=table_lookup(t,comparator, - (*t->key_function)(comparator), - t->compare_function, - &success); - if (success) return((*entry)->value); - return(0); -} - - -/* Function: hash_table_insert - * Arguments: t: a table to insert the object - * value: the object to put in the table - * comparator: the value by which the object - * will be addressed - * Returns: nothing - */ -void hash_table_insert (table t, void *value, void *comparator) -{ - int success; - unsigned int k=(*t->key_function)(comparator); - table_entry *position=table_lookup(t,comparator,k, - t->compare_function,&success); - table_entry entry; - - if (success) { - entry = *position; - } else { - entry = (table_entry)malloc(sizeof(struct table_entry)); - memset(entry, 0, sizeof(struct table_entry)); - entry->next= *position; - *position=entry; - t->number_of_entries++; - } - entry->value=value; - entry->key=k; - if (t->number_of_entries > t->size) resize_table(t,t->size*2); -} - -/* Function: hash_table_remove - * Arguments: t: the table to remove the object from - * comparator: the index value of the object to remove - * Returns: - */ -void hash_table_remove (table t, void *comparator) -{ - int success; - table_entry temp; - table_entry *position=table_lookup(t,comparator, - (*t->key_function)(comparator), - t->compare_function,&success); - if(success) { - temp=*position; - 
*position=(*position)->next; - free(temp); /* the value? */ - t->number_of_entries--; - } -} - -/* Function: hash_iterate_table_entries - * Arguments: t: the table to iterate over - * handler: a function to call with each element - * of the table, along with arg - * arg: the opaque object to pass to handler - * Returns: nothing - */ -void hash_iterate_table_entries(table t, - void (*handler)(void *,void *), - void *arg) -{ - int i; - table_entry *j,*next; - - for (i=0;isize;i++) - for (j=t->entries+i;*j;j=next){ - next=&((*j)->next); - (*handler)(arg,(*j)->value); - } -} - -/* Function: hash_filter_table_entries - * Arguments: t: the table to iterate over - * handler: a function to call with each element - * of the table, along with arg - * arg: the opaque object to pass to handler - * Returns: nothing - * Notes: operations on the table inside handler are not safe - * - * filter_table_entires() calls the handler function for each - * item in the table, passing it and arg. The handler function - * returns 1 if it is to be retained in the table, and 0 - * if it is to be removed. 
- */ -void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) -{ - int i; - table_entry *j,*next,v; - - for (i=0;isize;i++) - for (j=t->entries+i;*j;j=next){ - next=&((*j)->next); - if (!(*handler)(arg,(*j)->value)){ - next=j; - v=*j; - *j=(*j)->next; - free(v); - t->number_of_entries--; - } - } -} - -/* Function: destroy_table - * Arguments: t: the table to free - * thunk: a function to call with each element, - * most likely free() - * Returns: nothing - */ -void hash_destroy_table(table t,void (*thunk)(void *)) -{ - table_entry j,next; - int i; - for (i=0;isize;i++) - for (j=t->entries[i];j;j=next){ - next=j->next; - if (thunk) (*thunk)(j->value); - free(j); - } - free(t->entries); - free(t); -} diff --git a/lustre/portals/unals/table.h b/lustre/portals/unals/table.h deleted file mode 100644 index 7fab586..0000000 --- a/lustre/portals/unals/table.h +++ /dev/null @@ -1,39 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef E_TABLE -#define E_TABLE - -typedef struct table_entry { - unsigned int key; - void *value; - struct table_entry *next; -} *table_entry; - - -typedef struct table { - unsigned int size; - int number_of_entries; - table_entry *entries; - int (*compare_function)(void *, void *); - unsigned int (*key_function)(unsigned int *); -} *table; - -/* table.c */ -unsigned int key_from_int(int i); -unsigned int key_from_string(char *s); -table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); -void *hash_table_find(table t, void *comparator); -void hash_table_insert(table t, void *value, void *comparator); -void hash_table_remove(table t, void *comparator); -void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); -void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); -void hash_destroy_table(table t, void (*thunk)(void *)); - -#endif diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c deleted file mode 100644 index abb6d01..0000000 --- a/lustre/portals/unals/tcpnal.c +++ /dev/null @@ -1,256 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* tcpnal.c: - This file implements the TCP-based nal by providing glue - between the connection service and the generic NAL implementation */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif - -/* Function: tcpnal_send - * Arguments: nal: pointer to my nal control block - * private: unused - * cookie: passed back to the portals library - * hdr: pointer to the portals header - * nid: destination node - * pid: destination process - * data: body of the message - * len: length of the body - * Returns: zero on success - * - * sends a packet to the peer, after insuring that a connection exists - */ -ptl_err_t tcpnal_send(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t len) -{ - connection c; - bridge b=(bridge)n->libnal_data; - struct iovec tiov[257]; - static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - ptl_err_t rc = PTL_OK; - int sysrc; - int total; - int ntiov; - int i; - - if (!(c=force_tcp_connection((manager)b->lower, - PNAL_IP(nid,b), - PNAL_PORT(nid,pid), - b->local))) - return(PTL_FAIL); - - /* TODO: these results should be checked. furthermore, provision - must be made for the SIGPIPE which is delivered when - writing on a tcp socket which has closed underneath - the application. 
there is a linux flag in the sendmsg - call which turns off the signally behaviour, but its - nonstandard */ - - LASSERT (niov <= 256); - - tiov[0].iov_base = hdr; - tiov[0].iov_len = sizeof(ptl_hdr_t); - ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len); - - pthread_mutex_lock(&send_lock); -#if 1 - for (i = total = 0; i < ntiov; i++) - total += tiov[i].iov_len; - - sysrc = syscall(SYS_writev, c->fd, tiov, ntiov); - if (sysrc != total) { - fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", - rc, total, errno); - rc = PTL_FAIL; - } -#else - for (i = total = 0; i <= ntiov; i++) { - rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0); - - if (rc != tiov[i].iov_len) { - fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", - rc, tiov[i].iov_len, errno); - rc = PTL_FAIL; - break; - } - total += rc; - } -#endif -#if 0 - fprintf (stderr, "sent %s total %d in %d frags\n", - hdr->type == PTL_MSG_ACK ? "ACK" : - hdr->type == PTL_MSG_PUT ? "PUT" : - hdr->type == PTL_MSG_GET ? "GET" : - hdr->type == PTL_MSG_REPLY ? "REPLY" : - hdr->type == PTL_MSG_HELLO ? "HELLO" : "UNKNOWN", - total, niov + 1); -#endif - pthread_mutex_unlock(&send_lock); - - if (rc == PTL_OK) { - /* NB the NAL only calls lib_finalize() if it returns PTL_OK - * from cb_send() */ - lib_finalize(n, private, cookie, PTL_OK); - } - - return(rc); -} - - -/* Function: tcpnal_recv - * Arguments: lib_nal_t *nal: pointer to my nal control block - * void *private: connection pointer passed through - * lib_parse() - * lib_msg_t *cookie: passed back to portals library - * user_ptr data: pointer to the destination buffer - * size_t mlen: length of the body - * size_t rlen: length of data in the network - * Returns: zero on success - * - * blocking read of the requested data. 
must drain out the - * difference of mainpulated and requested lengths from the network - */ -ptl_err_t tcpnal_recv(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) - -{ - struct iovec tiov[256]; - int ntiov; - int i; - - if (!niov) - goto finalize; - - LASSERT(mlen); - LASSERT(rlen); - LASSERT(rlen >= mlen); - - ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen); - - /* FIXME - * 1. Is this effecient enough? change to use readv() directly? - * 2. need check return from read_connection() - * - MeiJia - */ - for (i = 0; i < ntiov; i++) - read_connection(private, tiov[i].iov_base, tiov[i].iov_len); - -finalize: - /* FIXME; we always assume success here... */ - lib_finalize(n, private, cookie, PTL_OK); - - if (mlen!=rlen){ - char *trash=malloc(rlen-mlen); - - /*TODO: check error status*/ - read_connection(private,trash,rlen-mlen); - free(trash); - } - - return(PTL_OK); -} - - -/* Function: from_connection: - * Arguments: c: the connection to read from - * Returns: whether or not to continue reading from this connection, - * expressed as a 1 to continue, and a 0 to not - * - * from_connection() is called from the select loop when i/o is - * available. It attempts to read the portals header and - * pass it to the generic library for processing. 
- */ -static int from_connection(void *a, void *d) -{ - connection c = d; - bridge b = a; - ptl_hdr_t hdr; - - if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->lib_nal, &hdr, c); - /*TODO: check error status*/ - return(1); - } - return(0); -} - - -static void tcpnal_shutdown(bridge b) -{ - shutdown_connections(b->lower); -} - -/* Function: PTL_IFACE_TCP - * Arguments: pid_request: desired port number to bind to - * desired: passed NAL limits structure - * actual: returned NAL limits structure - * Returns: a nal structure on success, or null on failure - */ -int tcpnal_init(bridge b) -{ - manager m; - - b->lib_nal->libnal_send=tcpnal_send; - b->lib_nal->libnal_recv=tcpnal_recv; - b->shutdown=tcpnal_shutdown; - - if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, - b->lib_nal->libnal_ni.ni_pid.pid), - from_connection,b))){ - /* TODO: this needs to shut down the - newly created junk */ - return(PTL_NAL_FAILED); - } - b->lower=m; - return(PTL_OK); -} diff --git a/lustre/portals/unals/timer.h b/lustre/portals/unals/timer.h deleted file mode 100644 index aaf39d2..0000000 --- a/lustre/portals/unals/timer.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -/* TODO: make this an explicit type when they become available */ -typedef unsigned long long when; - -typedef struct timer { - void (*function)(void *); - void *arg; - when w; - int interval; - int disable; -} *timer; - -timer register_timer(when, void (*f)(void *), void *a); -void remove_timer(timer t); -void timer_loop(void); -void initialize_timer(void); -void register_thunk(void (*f)(void *),void *a); - - -#define HZ 0x100000000ull - - diff --git a/lustre/portals/unals/utypes.h b/lustre/portals/unals/utypes.h deleted file mode 100644 index 7eca959..0000000 --- a/lustre/portals/unals/utypes.h +++ /dev/null @@ -1,12 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -typedef unsigned short uint16; -typedef unsigned long uint32; -typedef unsigned long long uint64; -typedef unsigned char uint8; diff --git a/lustre/portals/utils/.cvsignore b/lustre/portals/utils/.cvsignore deleted file mode 100644 index e2a0d44..0000000 --- a/lustre/portals/utils/.cvsignore +++ /dev/null @@ -1,10 +0,0 @@ -Makefile -Makefile.in -acceptor -debugctl -ptlctl -.deps -routerstat -wirecheck -gmnalnid -.*.cmd diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am deleted file mode 100644 index 1d9f905..0000000 --- a/lustre/portals/utils/Makefile.am +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. 
-# See the file COPYING in this distribution - -## $(srcdir)/../ for , ../../ for generated -#COMPILE = $(CC) -Wall -g -I$(srcdir)/../include -I../../include -#LINK = $(CC) -o $@ - -if LIBLUSTRE -noinst_LIBRARIES = libuptlctl.a -endif - -libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c -libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) -libuptlctl_a_CFLAGS = $(LLCFLAGS) - -sbin_PROGRAMS = debugctl - -lib_LIBRARIES = libptlctl.a - -libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h - -if UTILS -if !CRAY_PORTALS -sbin_PROGRAMS += acceptor ptlctl routerstat wirecheck gmnalnid -endif -endif - -acceptor_SOURCES = acceptor.c -acceptor_LDADD = $(LIBWRAP) - -wirecheck_SOURCES = wirecheck.c - -gmnalnid_SOURCES = gmnalnid.c - -ptlctl_SOURCES = ptlctl.c -ptlctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) -ptlctl_DEPENDENCIES = libptlctl.a - -routerstat_SOURCES = routerstat.c - -debugctl_SOURCES = debugctl.c -debugctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) -debugctl_DEPENDENCIES = libptlctl.a - diff --git a/lustre/portals/utils/Makefile.mk b/lustre/portals/utils/Makefile.mk deleted file mode 100644 index cbbe6d5..0000000 --- a/lustre/portals/utils/Makefile.mk +++ /dev/null @@ -1,6 +0,0 @@ -include $(src)/../Kernelenv - -host-progs := acceptor ptlctl -always := $(host-progs) - -ptlctl-objs := ptlctl.o $(PTLCTLOBJS) diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c deleted file mode 100644 index 524d128..0000000 --- a/lustre/portals/utils/acceptor.c +++ /dev/null @@ -1,249 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_LIBWRAP -#include -#include -#include -#endif - -#include -#include -#include -#include - -/* should get this from autoconf somehow */ -#ifndef PIDFILE_DIR -#define PIDFILE_DIR "/var/run" -#endif - -#define 
PROGNAME "acceptor" - -#ifdef HAVE_LIBWRAP -/* needed because libwrap declares these as externs */ -int allow_severity = LOG_INFO; -int deny_severity = LOG_WARNING; -#endif - -void create_pidfile(char *name, int port) -{ - char pidfile[1024]; - FILE *fp; - - snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", - PIDFILE_DIR, name, port); - - if ((fp = fopen(pidfile, "w"))) { - fprintf(fp, "%d\n", getpid()); - fclose(fp); - } else { - syslog(LOG_ERR, "%s: %s\n", pidfile, - strerror(errno)); - } -} - -int pidfile_exists(char *name, int port) -{ - char pidfile[1024]; - - snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", - PIDFILE_DIR, name, port); - - if (!access(pidfile, F_OK)) { - fprintf(stderr, "%s: exists, acceptor already running.\n", - pidfile); - return (1); - } - return (0); -} - -void -show_connection (int fd, __u32 net_ip) -{ - struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); - __u32 host_ip = ntohl (net_ip); - int len; - char host[1024]; - - if (h == NULL) - snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, - (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); - else - snprintf (host, sizeof(host), "%s", h->h_name); - - syslog (LOG_INFO, "Accepted host: %s\n", host); -} - -void -usage (char *myname) -{ - fprintf (stderr, - "Usage: %s [-N nal_id] [-p] [-l] port\n\n" - " -l\tKeep stdin/stdout open\n" - " -p\tAllow connections from non-privileged ports\n", - myname); - exit (1); -} - -int main(int argc, char **argv) -{ - int o, fd, rc, port, pfd; - struct sockaddr_in srvaddr; - int c; - int noclose = 0; - int nal = SOCKNAL; - int rport; - int require_privports = 1; - - while ((c = getopt (argc, argv, "N:lp")) != -1) { - switch (c) { - case 'N': - if (sscanf(optarg, "%d", &nal) != 1 || - nal < 0 || nal > NAL_MAX_NR) - usage(argv[0]); - break; - case 'l': - noclose = 1; - break; - case 'p': - require_privports = 0; - break; - default: - usage (argv[0]); - break; - } - } - - if (optind >= argc) - usage 
(argv[0]); - - port = atol(argv[optind++]); - - if (pidfile_exists(PROGNAME, port)) - exit(1); - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(port); - srvaddr.sin_addr.s_addr = INADDR_ANY; - - fd = socket(PF_INET, SOCK_STREAM, 0); - if (fd < 0) { - perror("opening socket"); - exit(1); - } - - o = 1; - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { - perror("Cannot set REUSEADDR socket opt"); - exit(1); - } - - rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if ( rc == -1 ) { - perror("bind: "); - exit(1); - } - - if (listen(fd, 127)) { - perror("listen: "); - exit(1); - } - fprintf(stderr, "listening on port %d\n", port); - - pfd = open("/dev/portals", O_RDWR); - if ( pfd < 0 ) { - perror("opening portals device"); - exit(1); - } - - rc = daemon(0, noclose); - if (rc < 0) { - perror("daemon(): "); - exit(1); - } - - openlog(PROGNAME, LOG_PID, LOG_DAEMON); - syslog(LOG_INFO, "started, listening on port %d\n", port); - create_pidfile(PROGNAME, port); - - while (1) { - struct sockaddr_in clntaddr; - int len = sizeof(clntaddr); - int cfd; - struct portal_ioctl_data data; - struct portals_cfg pcfg; -#ifdef HAVE_LIBWRAP - struct request_info request; -#endif - char addrstr[INET_ADDRSTRLEN]; - - cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); - if ( cfd < 0 ) { - perror("accept"); - exit(0); - continue; - } - -#ifdef HAVE_LIBWRAP - /* libwrap access control */ - request_init(&request, RQ_DAEMON, "lustre", RQ_FILE, cfd, 0); - sock_host(&request); - if (!hosts_access(&request)) { - inet_ntop(AF_INET, &clntaddr.sin_addr, - addrstr, INET_ADDRSTRLEN); - syslog(LOG_WARNING, "Unauthorized access from %s:%hd\n", - addrstr, ntohs(clntaddr.sin_port)); - close (cfd); - continue; - } -#endif - - if (require_privports && ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) { - inet_ntop(AF_INET, &clntaddr.sin_addr, - addrstr, INET_ADDRSTRLEN); - syslog(LOG_ERR, "Closing non-privileged connection from 
%s:%d\n", - addrstr, ntohs(clntaddr.sin_port)); - rc = close(cfd); - if (rc) - perror ("close un-privileged client failed"); - continue; - } - - show_connection (cfd, clntaddr.sin_addr.s_addr); - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); - pcfg.pcfg_nal = nal; - pcfg.pcfg_fd = cfd; - pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */ - - PORTAL_IOC_INIT(data); - data.ioc_pbuf1 = (char*)&pcfg; - data.ioc_plen1 = sizeof(pcfg); - - if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { - perror("ioctl failed"); - } else { - printf("client registered\n"); - } - rc = close(cfd); - if (rc) - perror ("close failed"); - } - - closelog(); - exit(0); - -} diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c deleted file mode 100644 index 9886a5c..0000000 --- a/lustre/portals/utils/debug.c +++ /dev/null @@ -1,833 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Some day I'll split all of this functionality into a cfs_debug module - * of its own. That day is not today. 
- * - */ - -#define __USE_FILE_OFFSET64 -#define _GNU_SOURCE - -#include - -#include -#ifdef HAVE_NETDB_H -#include -#endif -#include -#include -#include "ioctl.h" -#include -#include -#include -#ifndef __CYGWIN__ -# include -#endif - -#include -#include -#include -#include -#include - -#ifdef HAVE_LINUX_VERSION_H -#include - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define BUG() /* workaround for module.h includes */ -#include -#endif -#endif /* !HAVE_LINUX_VERSION_H */ - -#include - -#include -#include -#include "parser.h" - -#include - -static char rawbuf[8192]; -static char *buf = rawbuf; -static int max = 8192; -/*static int g_pfd = -1;*/ -static int subsystem_mask = ~0; -static int debug_mask = ~0; - -#define MAX_MARK_SIZE 100 - -static const char *portal_debug_subsystems[] = - {"undefined", "mdc", "mds", "osc", - "ost", "class", "log", "llite", - "rpc", "mgmt", "portals", "nal", - "pinger", "filter", "ptlbd", "echo", - "ldlm", "lov", "router", "cobd", - "sm", "asobd", "confobd", "lmv", - "cmobd", NULL}; -static const char *portal_debug_masks[] = - {"trace", "inode", "super", "ext2", - "malloc", "cache", "info", "ioctl", - "blocks", "net", "warning", "buffs", - "other", "dentry", "portals", "page", - "dlmtrace", "error", "emerg", "ha", - "rpctrace", "vfstrace", "reada", "mmap", - "config", NULL}; - -struct debug_daemon_cmd { - char *cmd; - unsigned int cmdv; -}; - -static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { - {"start", DEBUG_DAEMON_START}, - {"stop", DEBUG_DAEMON_STOP}, - {0, 0} -}; - -static int do_debug_mask(char *name, int enable) -{ - int found = 0, i; - - for (i = 0; portal_debug_subsystems[i] != NULL; i++) { - if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || - strcasecmp(name, "all_subs") == 0) { - printf("%s output from subsystem \"%s\"\n", - enable ? 
"Enabling" : "Disabling", - portal_debug_subsystems[i]); - if (enable) - subsystem_mask |= (1 << i); - else - subsystem_mask &= ~(1 << i); - found = 1; - } - } - for (i = 0; portal_debug_masks[i] != NULL; i++) { - if (strcasecmp(name, portal_debug_masks[i]) == 0 || - strcasecmp(name, "all_types") == 0) { - printf("%s output of type \"%s\"\n", - enable ? "Enabling" : "Disabling", - portal_debug_masks[i]); - if (enable) - debug_mask |= (1 << i); - else - debug_mask &= ~(1 << i); - found = 1; - } - } - - return found; -} - -int dbg_initialize(int argc, char **argv) -{ - return 0; -} - -int jt_dbg_filter(int argc, char **argv) -{ - int i; - - if (argc < 2) { - fprintf(stderr, "usage: %s \n", - argv[0]); - return 0; - } - - for (i = 1; i < argc; i++) - if (!do_debug_mask(argv[i], 0)) - fprintf(stderr, "Unknown subsystem or debug type: %s\n", - argv[i]); - return 0; -} - -int jt_dbg_show(int argc, char **argv) -{ - int i; - - if (argc < 2) { - fprintf(stderr, "usage: %s \n", - argv[0]); - return 0; - } - - for (i = 1; i < argc; i++) - if (!do_debug_mask(argv[i], 1)) - fprintf(stderr, "Unknown subsystem or debug type: %s\n", - argv[i]); - - return 0; -} - -static int applymask(char* procpath, int value) -{ - int rc; - char buf[64]; - int len = snprintf(buf, 64, "%d", value); - - int fd = open(procpath, O_WRONLY); - if (fd == -1) { - fprintf(stderr, "Unable to open %s: %s\n", - procpath, strerror(errno)); - return fd; - } - rc = write(fd, buf, len+1); - if (rc<0) { - fprintf(stderr, "Write to %s failed: %s\n", - procpath, strerror(errno)); - return rc; - } - close(fd); - return 0; -} - -static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) -{ - if (!dump_filename) { - applymask("/proc/sys/portals/subsystem_debug", subs_mask); - applymask("/proc/sys/portals/debug", debug_mask); - } else { - struct portals_debug_ioctl_data data; - - data.hdr.ioc_len = sizeof(data); - data.hdr.ioc_version = 0; - data.subs = subs_mask; - data.debug = debug_mask; - - 
dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); - } - printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", - subs_mask, debug_mask); -} - -int jt_dbg_list(int argc, char **argv) -{ - int i; - - if (argc != 2) { - fprintf(stderr, "usage: %s \n", argv[0]); - return 0; - } - - if (strcasecmp(argv[1], "subs") == 0) { - printf("Subsystems: all_subs"); - for (i = 0; portal_debug_subsystems[i] != NULL; i++) - printf(", %s", portal_debug_subsystems[i]); - printf("\n"); - } else if (strcasecmp(argv[1], "types") == 0) { - printf("Types: all_types"); - for (i = 0; portal_debug_masks[i] != NULL; i++) - printf(", %s", portal_debug_masks[i]); - printf("\n"); - } else if (strcasecmp(argv[1], "applymasks") == 0) { - applymask_all(subsystem_mask, debug_mask); - } - return 0; -} - -/* all strings nul-terminated; only the struct and hdr need to be freed */ -struct dbg_line { - struct ptldebug_header *hdr; - char *file; - char *fn; - char *text; - struct list_head chain; -}; - -/* nurr. */ -static void list_add_ordered(struct dbg_line *new, struct list_head *head) -{ - struct list_head *pos; - struct dbg_line *curr; - - list_for_each(pos, head) { - curr = list_entry(pos, struct dbg_line, chain); - - if (curr->hdr->ph_sec < new->hdr->ph_sec) - continue; - if (curr->hdr->ph_sec == new->hdr->ph_sec && - curr->hdr->ph_usec < new->hdr->ph_usec) - continue; - - list_add(&new->chain, pos->prev); - return; - } - list_add_tail(&new->chain, head); -} - -static void print_saved_records(struct list_head *list, FILE *out) -{ - struct list_head *pos, *tmp; - - list_for_each_safe(pos, tmp, list) { - struct dbg_line *line; - struct ptldebug_header *hdr; - - line = list_entry(pos, struct dbg_line, chain); - list_del(&line->chain); - - hdr = line->hdr; - fprintf(out, "%06x:%06x:%u:%u.%06Lu:%u:%u:%u:(%s:%u:%s()) %s", - hdr->ph_subsys, hdr->ph_mask, hdr->ph_cpu_id, - hdr->ph_sec, (unsigned long long)hdr->ph_usec, - hdr->ph_stack, hdr->ph_pid, hdr->ph_extern_pid, - line->file, 
hdr->ph_line_num, line->fn, line->text); - free(line->hdr); - free(line); - } -} - -static int parse_buffer(FILE *in, FILE *out) -{ - struct dbg_line *line; - struct ptldebug_header *hdr; - char buf[4097], *p; - int rc; - unsigned long dropped = 0, kept = 0; - struct list_head chunk_list; - - INIT_LIST_HEAD(&chunk_list); - - while (1) { - rc = fread(buf, sizeof(hdr->ph_len), 1, in); - if (rc <= 0) - break; - - hdr = (void *)buf; - if (hdr->ph_len == 0) - break; - if (hdr->ph_len > 4094) { - fprintf(stderr, "unexpected large record: %d bytes. " - "aborting.\n", - hdr->ph_len); - break; - } - - if (hdr->ph_flags & PH_FLAG_FIRST_RECORD) { - print_saved_records(&chunk_list, out); - assert(list_empty(&chunk_list)); - } - - rc = fread(buf + sizeof(hdr->ph_len), 1, - hdr->ph_len - sizeof(hdr->ph_len), in); - if (rc <= 0) - break; - - if (hdr->ph_mask && - (!(subsystem_mask & hdr->ph_subsys) || - (!(debug_mask & hdr->ph_mask)))) { - dropped++; - continue; - } - - line = malloc(sizeof(*line)); - if (line == NULL) { - fprintf(stderr, "malloc failed; printing accumulated " - "records and exiting.\n"); - break; - } - - line->hdr = malloc(hdr->ph_len + 1); - if (line->hdr == NULL) { - fprintf(stderr, "malloc failed; printing accumulated " - "records and exiting.\n"); - break; - } - - p = (void *)line->hdr; - memcpy(line->hdr, buf, hdr->ph_len); - p[hdr->ph_len] = '\0'; - - p += sizeof(*hdr); - line->file = p; - p += strlen(line->file) + 1; - line->fn = p; - p += strlen(line->fn) + 1; - line->text = p; - - list_add_ordered(line, &chunk_list); - kept++; - } - - print_saved_records(&chunk_list, out); - - printf("Debug log: %lu lines, %lu kept, %lu dropped.\n", - dropped + kept, kept, dropped); - return 0; -} - -int jt_dbg_debug_kernel(int argc, char **argv) -{ - char filename[4096]; - struct stat st; - int rc, raw = 0, fd; - FILE *in, *out = stdout; - - if (argc > 3) { - fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); - return 0; - } - - if (argc > 2) { - raw = 
atoi(argv[2]); - } else if (argc > 1 && (argv[1][0] == '0' || argv[1][0] == '1')) { - raw = atoi(argv[1]); - argc--; - } - - /* If we are dumping raw (which means no conversion step to ASCII) - * then dump directly to any supplied filename, otherwise this is - * just a temp file and we dump to the real file at convert time. */ - if (argc > 1 && raw) - strcpy(filename, argv[1]); - else - sprintf(filename, "/tmp/lustre-log.%lu.%u",time(NULL),getpid()); - - if (stat(filename, &st) == 0 && S_ISREG(st.st_mode)) - unlink(filename); - - fd = open("/proc/sys/portals/dump_kernel", O_WRONLY); - if (fd < 0) { - fprintf(stderr, "open(dump_kernel) failed: %s\n", - strerror(errno)); - return 1; - } - - rc = write(fd, filename, strlen(filename)); - if (rc != strlen(filename)) { - fprintf(stderr, "write(%s) failed: %s\n", filename, - strerror(errno)); - close(fd); - return 1; - } - close(fd); - - if (raw) - return 0; - - in = fopen(filename, "r"); - if (in == NULL) { - if (errno == ENOENT) /* no dump file created */ - return 0; - - fprintf(stderr, "fopen(%s) failed: %s\n", filename, - strerror(errno)); - return 1; - } - if (argc > 1) { - out = fopen(argv[1], "w"); - if (out == NULL) { - fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], - strerror(errno)); - fclose(in); - return 1; - } - } - - rc = parse_buffer(in, out); - fclose(in); - if (argc > 1) - fclose(out); - if (rc) { - fprintf(stderr, "parse_buffer failed; leaving tmp file %s " - "behind.\n", filename); - } else { - rc = unlink(filename); - if (rc) - fprintf(stderr, "dumped successfully, but couldn't " - "unlink tmp file %s: %s\n", filename, - strerror(errno)); - } - return rc; -} - -int jt_dbg_debug_file(int argc, char **argv) -{ - int fdin,fdout; - FILE *in, *out = stdout; - if (argc > 3 || argc < 2) { - fprintf(stderr, "usage: %s [output]\n", argv[0]); - return 0; - } - - fdin = open(argv[1], O_RDONLY | O_LARGEFILE); - if (fdin == -1) { - fprintf(stderr, "open(%s) failed: %s\n", argv[1], - strerror(errno)); - return 
1; - } - in = fdopen(fdin, "r"); - if (in == NULL) { - fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], - strerror(errno)); - close(fdin); - return 1; - } - if (argc > 2) { - fdout = open(argv[2], O_CREAT | O_WRONLY | O_LARGEFILE); - if (fdout == -1) { - fprintf(stderr, "open(%s) failed: %s\n", argv[2], - strerror(errno)); - fclose(in); - return 1; - } - out = fdopen(fdout, "w"); - if (out == NULL) { - fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], - strerror(errno)); - fclose(in); - close(fdout); - return 1; - } - } - - return parse_buffer(in, out); -} - -static int -dbg_write_cmd(int fd, char *str) -{ - int len = strlen(str); - int rc = write(fd, str, len); - - return (rc == len ? 0 : 1); -} - -const char debug_daemon_usage[] = "usage: %s {start file [MB]|stop}\n"; -#define DAEMON_FILE "/proc/sys/portals/daemon_file" -int jt_dbg_debug_daemon(int argc, char **argv) -{ - int rc; - int fd; - - if (argc <= 1) { - fprintf(stderr, debug_daemon_usage, argv[0]); - return 1; - } - - fd = open(DAEMON_FILE, O_WRONLY); - if (fd < 0) { - fprintf(stderr, "open %s failed: %s\n", DAEMON_FILE, - strerror(errno)); - return -1; - } - - rc = -1; - if (strcasecmp(argv[1], "start") == 0) { - if (argc < 3 || argc > 4 || - (argc == 4 && strlen(argv[3]) > 5)) { - fprintf(stderr, debug_daemon_usage, argv[0]); - goto out; - } - - if (argc == 4) { - char buf[12]; - const long min_size = 10; - const long max_size = 20480; - long size; - char *end; - - size = strtoul(argv[3], &end, 0); - if (size < min_size || - size > max_size || - *end != 0) { - fprintf(stderr, "size %s invalid, must be in " - "the range %ld-%ld MB\n", argv[3], - min_size, max_size); - goto out; - } - - snprintf(buf, sizeof(buf), "size=%ld", size); - rc = dbg_write_cmd(fd, buf); - if (rc != 0) { - fprintf(stderr, "set %s failed: %s\n", - buf, strerror(errno)); - goto out; - } - } - - rc = dbg_write_cmd(fd, "start"); - if (rc != 0) { - fprintf(stderr, "start debug_daemon on %s failed: %s\n", - argv[2], 
strerror(errno)); - goto out; - } - - rc = 0; - goto out; - } - - if (strcasecmp(argv[1], "stop") == 0) { - rc = dbg_write_cmd(fd, "stop"); - if (rc != 0) { - fprintf(stderr, "stopping debug_daemon failed: %s\n", - strerror(errno)); - goto out; - } - - rc = 0; - goto out; - } - - fprintf(stderr, debug_daemon_usage, argv[0]); - rc = -1; -out: - close(fd); - return rc; -} - -int jt_dbg_clear_debug_buf(int argc, char **argv) -{ - int rc; - struct portal_ioctl_data data; - - if (argc != 1) { - fprintf(stderr, "usage: %s\n", argv[0]); - return 0; - } - - memset(&data, 0, sizeof(data)); - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); - return -1; - } - - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf); - if (rc) { - fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n", - strerror(errno)); - return -1; - } - return 0; -} - -int jt_dbg_mark_debug_buf(int argc, char **argv) -{ - int rc, max_size = MAX_MARK_SIZE-1; - struct portal_ioctl_data data; - char *text; - time_t now = time(NULL); - - if (argc > 1) { - int counter; - text = malloc(MAX_MARK_SIZE); - strncpy(text, argv[1], max_size); - max_size-=strlen(argv[1]); - for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){ - strncat(text, " ", 1); - max_size-=1; - strncat(text, argv[counter], max_size); - max_size-=strlen(argv[counter]); - } - } else { - text = ctime(&now); - text[strlen(text) - 1] = '\0'; /* stupid \n */ - } - if (!max_size) { - text[MAX_MARK_SIZE - 1] = '\0'; - } - - memset(&data, 0, sizeof(data)); - data.ioc_inllen1 = strlen(text) + 1; - data.ioc_inlbuf1 = text; - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); - return -1; - } - - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf); - if (rc) { - fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n", - strerror(errno)); - return -1; - } - return 0; -} - -static struct mod_paths { - char *name, *path; -} mod_paths[] = { - 
{"libcfs", "lustre/portals/libcfs"}, - {"portals", "lustre/portals/portals"}, - {"ksocknal", "lustre/portals/knals/socknal"}, - {"kptlrouter", "lustre/portals/router"}, - {"lvfs", "lustre/lvfs"}, - {"obdclass", "lustre/obdclass"}, - {"llog_test", "lustre/obdclass"}, - {"ptlrpc", "lustre/ptlrpc"}, - {"obdext2", "lustre/obdext2"}, - {"ost", "lustre/ost"}, - {"osc", "lustre/osc"}, - {"mds", "lustre/mds"}, - {"mdc", "lustre/mdc"}, - {"llite", "lustre/llite"}, - {"ldiskfs", "lustre/ldiskfs"}, - {"smfs", "lustre/smfs"}, - {"obdecho", "lustre/obdecho"}, - {"ldlm", "lustre/ldlm"}, - {"obdfilter", "lustre/obdfilter"}, - {"extN", "lustre/extN"}, - {"lov", "lustre/lov"}, - {"lmv", "lustre/lmv"}, - {"fsfilt_ext3", "lustre/lvfs"}, - {"fsfilt_extN", "lustre/lvfs"}, - {"fsfilt_reiserfs", "lustre/lvfs"}, - {"fsfilt_smfs", "lustre/lvfs"}, - {"fsfilt_ldiskfs", "lustre/lvfs"}, - {"mds_ext2", "lustre/mds"}, - {"mds_ext3", "lustre/mds"}, - {"mds_extN", "lustre/mds"}, - {"ptlbd", "lustre/ptlbd"}, - {"mgmt_svc", "lustre/mgmt"}, - {"mgmt_cli", "lustre/mgmt"}, - {"cobd", "lustre/cobd"}, - {"cmobd", "lustre/cmobd"}, - {"conf_obd", "lustre/obdclass"}, - {NULL, NULL} -}; - -static int jt_dbg_modules_2_4(int argc, char **argv) -{ -#ifdef HAVE_LINUX_VERSION_H -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - struct mod_paths *mp; - char *path = ".."; - char *kernel = "linux"; - - if (argc >= 2) - path = argv[1]; - if (argc == 3) - kernel = argv[2]; - if (argc > 3) { - printf("%s [path] [kernel]\n", argv[0]); - return 0; - } - - for (mp = mod_paths; mp->name != NULL; mp++) { - struct module_info info; - int rc; - size_t crap; - int query_module(const char *name, int which, void *buf, - size_t bufsize, size_t *ret); - - rc = query_module(mp->name, QM_INFO, &info, sizeof(info), - &crap); - if (rc < 0) { - if (errno != ENOENT) - printf("query_module(%s) failed: %s\n", - mp->name, strerror(errno)); - } else { - printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, - mp->path, mp->name, - info.addr + 
sizeof(struct module)); - } - } - - return 0; -#endif /* Headers are 2.6-only */ -#endif /* !HAVE_LINUX_VERSION_H */ - return -EINVAL; -} - -static int jt_dbg_modules_2_5(int argc, char **argv) -{ - struct mod_paths *mp; - char *path = ".."; - char *kernel = "linux"; - const char *proc = "/proc/modules"; - char modname[128], others[128]; - long modaddr; - int rc; - FILE *file; - - if (argc >= 2) - path = argv[1]; - if (argc == 3) - kernel = argv[2]; - if (argc > 3) { - printf("%s [path] [kernel]\n", argv[0]); - return 0; - } - - file = fopen(proc, "r"); - if (!file) { - printf("failed open %s: %s\n", proc, strerror(errno)); - return 0; - } - - while ((rc = fscanf(file, "%s %s %s %s %s %lx\n", - modname, others, others, others, others, &modaddr)) == 6) { - for (mp = mod_paths; mp->name != NULL; mp++) { - if (!strcmp(mp->name, modname)) - break; - } - if (mp->name) { - printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, - mp->path, mp->name, modaddr); - } - } - - return 0; -} - -int jt_dbg_modules(int argc, char **argv) -{ - int rc = 0; - struct utsname sysinfo; - - rc = uname(&sysinfo); - if (rc) { - printf("uname() failed: %s\n", strerror(errno)); - return 0; - } - - if (sysinfo.release[2] > '4') { - return jt_dbg_modules_2_5(argc, argv); - } else { - return jt_dbg_modules_2_4(argc, argv); - } - - return 0; -} - -int jt_dbg_panic(int argc, char **argv) -{ - int rc; - struct portal_ioctl_data data; - - if (argc != 1) { - fprintf(stderr, "usage: %s\n", argv[0]); - return 0; - } - - memset(&data, 0, sizeof(data)); - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); - return -1; - } - - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf); - if (rc) { - fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", - strerror(errno)); - return -1; - } - return 0; -} diff --git a/lustre/portals/utils/debugctl.c b/lustre/portals/utils/debugctl.c deleted file mode 100644 index 1b6cd96..0000000 --- a/lustre/portals/utils/debugctl.c +++ 
/dev/null @@ -1,66 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Some day I'll split all of this functionality into a cfs_debug module - * of its own. That day is not today. 
- * - */ - -#include -#include -#include -#include -#include "parser.h" - - -command_t list[] = { - {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"}, - {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file|stop], control debug daemon to dump debug buffer to a file"}, - {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file [output] [raw], read debug buffer from input and print it [to output]"}, - {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"}, - {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"}, - {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"}, - {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"}, - {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"}, - {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: )"}, - {"panic", jt_dbg_panic, 0, "cause the kernel to panic"}, - {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, - {"help", Parser_help, 0, "help"}, - {"exit", Parser_quit, 0, "quit"}, - {"quit", Parser_quit, 0, "quit"}, - { 0, 0, 0, NULL } -}; - -int main(int argc, char **argv) -{ - if (dbg_initialize(argc, argv) < 0) - exit(2); - - register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); - - Parser_init("debugctl > ", list); - if (argc > 1) - return Parser_execarg(argc - 1, &argv[1], list); - - Parser_commands(); - - unregister_ioc_dev(PORTALS_DEV_ID); - return 0; -} diff --git a/lustre/portals/utils/gmnalnid.c b/lustre/portals/utils/gmnalnid.c deleted file mode 100644 index e45fae4..0000000 --- a/lustre/portals/utils/gmnalnid.c +++ /dev/null @@ -1,117 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2003 Los Alamos National Laboratory (LANL) - * - * This file is part of Lustre, 
http://www.lustre.org/ - * - * This file is free software; you can redistribute it and/or - * modify it under the terms of version 2.1 of the GNU Lesser General - * Public License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define GMNAL_IOC_GET_GNID 1 - -int -roundup(int len) -{ - return((len+7) & (~0x7)); -} - -int main(int argc, char **argv) -{ - int rc, pfd; - struct portal_ioctl_data data; - struct portals_cfg pcfg; - unsigned int nid = 0, len; - char *name = NULL; - int c; - - - - while ((c = getopt(argc, argv, "n:l")) != -1) { - switch(c) { - case('n'): - name = optarg; - break; - case('l'): - printf("Get local id not implemented yet!\n"); - exit(-1); - default: - printf("usage %s -n nodename [-p]\n", argv[0]); - } - } - - if (!name) { - printf("usage %s -n nodename [-p]\n", argv[0]); - exit(-1); - } - - - - - PCFG_INIT(pcfg, GMNAL_IOC_GET_GNID); - pcfg.pcfg_nal = GMNAL; - - /* - * set up the inputs - */ - len = strlen(name) + 1; - pcfg.pcfg_pbuf1 = malloc(len); - strcpy(pcfg.pcfg_pbuf1, name); - pcfg.pcfg_plen1 = len; - - /* - * set up the outputs - */ - pcfg.pcfg_pbuf2 = (void*)&nid; - pcfg.pcfg_plen2 = sizeof(unsigned int*); - - pfd = open("/dev/portals", O_RDWR); - if ( pfd < 0 ) { - perror("opening portals device"); - free(pcfg.pcfg_pbuf1); - exit(-1); - } - - PORTAL_IOC_INIT(data); - data.ioc_pbuf1 = (char*)&pcfg; - data.ioc_plen1 = 
sizeof(pcfg); - - rc = ioctl (pfd, IOC_PORTAL_NAL_CMD, &data); - if (rc < 0) - { - perror ("Can't get my NID"); - } - - free(pcfg.pcfg_pbuf1); - close(pfd); - printf("%u\n", nid); - exit(0); -} diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c deleted file mode 100644 index 0671c24..0000000 --- a/lustre/portals/utils/l_ioctl.c +++ /dev/null @@ -1,339 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#define __USE_FILE_OFFSET64 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifndef __CYGWIN__ - #include -#else - #include - #include -#endif - -static ioc_handler_t do_ioctl; /* forward ref */ -static ioc_handler_t *current_ioc_handler = &do_ioctl; - -struct ioc_dev { - const char * dev_name; - int dev_fd; -}; - -static struct ioc_dev ioc_dev_list[10]; - -struct dump_hdr { - int magic; - int dev_id; - unsigned int opc; -}; - -char *dump_filename; - -void -set_ioc_handler (ioc_handler_t *handler) -{ - if (handler == NULL) - current_ioc_handler = do_ioctl; - else - current_ioc_handler = handler; -} - -static int -open_ioc_dev(int dev_id) -{ - const char * dev_name; - - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return -EINVAL; - - dev_name = ioc_dev_list[dev_id].dev_name; - if (dev_name == NULL) { - fprintf(stderr, "unknown device id: %d\n", dev_id); - return -EINVAL; - } - - if (ioc_dev_list[dev_id].dev_fd < 0) { - int fd = open(dev_name, O_RDWR); - - if (fd < 0) { - fprintf(stderr, "opening %s failed: %s\n" - "hint: the kernel modules may not be loaded\n", - dev_name, strerror(errno)); - return fd; - } - ioc_dev_list[dev_id].dev_fd = fd; - } - - return ioc_dev_list[dev_id].dev_fd; -} - - -static int -do_ioctl(int dev_id, unsigned int opc, void *buf) -{ - int fd, rc; - - fd = open_ioc_dev(dev_id); - if (fd < 0) - return fd; - - rc = ioctl(fd, opc, buf); - return rc; - -} - -static FILE * -get_dump_file() -{ - FILE *fp = NULL; - - if (!dump_filename) { - fprintf(stderr, "no dump filename\n"); - } else - fp = fopen(dump_filename, "a"); - return fp; -} - -/* - * The dump file should start with a description of which devices are - * used, but for now it will assumed whatever app reads the file will - * know what to do. 
*/ -int -dump(int dev_id, unsigned int opc, void *buf) -{ - FILE *fp; - struct dump_hdr dump_hdr; - struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; - int rc; - - printf("dumping opc %x to %s\n", opc, dump_filename); - - - dump_hdr.magic = 0xdeadbeef; - dump_hdr.dev_id = dev_id; - dump_hdr.opc = opc; - - fp = get_dump_file(); - if (fp == NULL) { - fprintf(stderr, "%s: %s\n", dump_filename, - strerror(errno)); - return -EINVAL; - } - - rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); - if (rc == 1) - rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); - fclose(fp); - if (rc != 1) { - fprintf(stderr, "%s: %s\n", dump_filename, - strerror(errno)); - return -EINVAL; - } - - return 0; -} - -/* register a device to send ioctls to. */ -int -register_ioc_dev(int dev_id, const char * dev_name) -{ - - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return -EINVAL; - - unregister_ioc_dev(dev_id); - - ioc_dev_list[dev_id].dev_name = dev_name; - ioc_dev_list[dev_id].dev_fd = -1; - - return dev_id; -} - -void -unregister_ioc_dev(int dev_id) -{ - - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return; - if (ioc_dev_list[dev_id].dev_name != NULL && - ioc_dev_list[dev_id].dev_fd >= 0) - close(ioc_dev_list[dev_id].dev_fd); - - ioc_dev_list[dev_id].dev_name = NULL; - ioc_dev_list[dev_id].dev_fd = -1; -} - -/* If this file is set, then all ioctl buffers will be - appended to the file. */ -int -set_ioctl_dump(char * file) -{ - if (dump_filename) - free(dump_filename); - - dump_filename = strdup(file); - if (dump_filename == NULL) - abort(); - - set_ioc_handler(&dump); - return 0; -} - -int -l_ioctl(int dev_id, unsigned int opc, void *buf) -{ - return current_ioc_handler(dev_id, opc, buf); -} - -/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer - * in the file. For example: - * - * parse_dump("lctl.dump", l_ioctl); - * - * Note: if using l_ioctl, then you also need to register_ioc_dev() for - * each device used in the dump. 
- */ -int -parse_dump(char * dump_file, ioc_handler_t ioc_func) -{ - int line =0; - struct stat st; - char *start, *buf, *end; -#ifndef __CYGWIN__ - int fd; -#else - HANDLE fd, hmap; - DWORD size; -#endif - -#ifndef __CYGWIN__ - fd = syscall(SYS_open, dump_file, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "couldn't open %s: %s\n", dump_file, - strerror(errno)); - exit(1); - } - - if (fstat(fd, &st)) { - perror("stat fails"); - exit(1); - } - - if (st.st_size < 1) { - fprintf(stderr, "KML is empty\n"); - exit(1); - } - - start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); - end = start + st.st_size; - close(fd); - if (start == MAP_FAILED) { - fprintf(stderr, "can't create file mapping\n"); - exit(1); - } -#else - fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); - size = GetFileSize(fd, NULL); - if (size < 1) { - fprintf(stderr, "KML is empty\n"); - exit(1); - } - - hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL); - start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0); - end = buf + size; - CloseHandle(fd); - if (start == NULL) { - fprintf(stderr, "can't create file mapping\n"); - exit(1); - } -#endif /* __CYGWIN__ */ - - while (buf < end) { - struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; - struct portal_ioctl_hdr * data; - char tmp[8096]; - int rc; - - line++; - - data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); - if (buf + data->ioc_len > end ) { - fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, - data->ioc_len, end); - return -1; - } -#if 0 - printf ("dump_hdr: %lx data: %lx\n", - (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); - - printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, - data->ioc_len, data->ioc_version); -#endif - - memcpy(tmp, data, data->ioc_len); - - rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); - if (rc) { - printf("failed: %d\n", rc); - exit(1); - } - - buf += 
data->ioc_len + sizeof(*dump_hdr); - } - -#ifndef __CYGWIN__ - munmap(start, end - start); -#else - UnmapViewOfFile(start); - CloseHandle(hmap); -#endif - - return 0; -} - -int -jt_ioc_dump(int argc, char **argv) -{ - if (argc > 2) { - fprintf(stderr, "usage: %s [hostname]\n", argv[0]); - return 0; - } - printf("setting dumpfile to: %s\n", argv[1]); - - set_ioctl_dump(argv[1]); - return 0; -} diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c deleted file mode 100644 index b91295b..0000000 --- a/lustre/portals/utils/parser.c +++ /dev/null @@ -1,651 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef HAVE_LIBREADLINE -#define READLINE_LIBRARY -#include - -/* completion_matches() is #if 0-ed out in modern glibc */ -#ifndef completion_matches -# define completion_matches rl_completion_matches -#endif -#endif - -extern void using_history(void); -extern void stifle_history(int); -extern void add_history(char *); - -#include "parser.h" - -static command_t * top_level; /* Top level of commands, initialized by - * InitParser */ -static char * parser_prompt = NULL;/* Parser prompt, set by InitParser */ -static int done; /* Set to 1 if user types exit or quit */ - - -/* static functions */ -static char *skipwhitespace(char *s); -static char *skiptowhitespace(char *s); -static command_t *find_cmd(char *name, command_t cmds[], char **next); -static int process(char *s, char **next, command_t *lookup, command_t **result, - char **prev); -static void print_commands(char *str, command_t *table); - -static char * skipwhitespace(char * s) -{ - char * t; - int len; - - len = (int)strlen(s); - for (t = s; t <= s + len && isspace(*t); t++); - return(t); -} - - -static char * skiptowhitespace(char * s) -{ - char * t; - - for (t = s; *t && !isspace(*t); t++); - return(t); -} - -static int line2args(char *line, char **argv, int maxargs) -{ - char *arg; - int i = 0; - - arg = strtok(line, " \t"); - if ( arg ) { - argv[i] = arg; - i++; - } else - return 0; - - while( (arg = strtok(NULL, " \t")) && (i <= maxargs)) { - argv[i] = arg; - i++; - } - return i; -} - -/* find a command -- return it if unique otherwise print alternatives */ -static command_t *Parser_findargcmd(char *name, command_t cmds[]) -{ - command_t *cmd; - - for (cmd = cmds; cmd->pc_name; cmd++) { - if (strcmp(name, cmd->pc_name) == 0) - return cmd; - } - return NULL; -} - -int Parser_execarg(int argc, char **argv, command_t cmds[]) -{ - command_t *cmd; - - cmd = Parser_findargcmd(argv[0], cmds); - if ( cmd ) { - int rc = 
(cmd->pc_func)(argc, argv); - if (rc == CMD_HELP) - fprintf(stderr, "%s\n", cmd->pc_help); - return rc; - } else { - printf("Try interactive use without arguments or use one of:\n"); - for (cmd = cmds; cmd->pc_name; cmd++) - printf("\"%s\" ", cmd->pc_name); - printf("\nas argument.\n"); - } - return -1; -} - -/* returns the command_t * (NULL if not found) corresponding to a - _partial_ match with the first token in name. It sets *next to - point to the following token. Does not modify *name. */ -static command_t * find_cmd(char * name, command_t cmds[], char ** next) -{ - int i, len; - - if (!cmds || !name ) - return NULL; - - /* This sets name to point to the first non-white space character, - and next to the first whitespace after name, len to the length: do - this with strtok*/ - name = skipwhitespace(name); - *next = skiptowhitespace(name); - len = *next - name; - if (len == 0) - return NULL; - - for (i = 0; cmds[i].pc_name; i++) { - if (strncasecmp(name, cmds[i].pc_name, len) == 0) { - *next = skipwhitespace(*next); - return(&cmds[i]); - } - } - return NULL; -} - -/* Recursively process a command line string s and find the command - corresponding to it. This can be ambiguous, full, incomplete, - non-existent. */ -static int process(char *s, char ** next, command_t *lookup, - command_t **result, char **prev) -{ - *result = find_cmd(s, lookup, next); - *prev = s; - - /* non existent */ - if ( ! *result ) - return CMD_NONE; - - /* found entry: is it ambigous, i.e. not exact command name and - more than one command in the list matches. Note that find_cmd - points to the first ambiguous entry */ - if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) && - find_cmd(s, (*result) + 1, next)) - return CMD_AMBIG; - - /* found a unique command: component or full? 
*/ - if ( (*result)->pc_func ) { - return CMD_COMPLETE; - } else { - if ( *next == '\0' ) { - return CMD_INCOMPLETE; - } else { - return process(*next, next, (*result)->pc_sub_cmd, result, prev); - } - } -} - -#ifdef HAVE_LIBREADLINE -static command_t * match_tbl; /* Command completion against this table */ -static char * command_generator(const char * text, int state) -{ - static int index, - len; - char *name; - - /* Do we have a match table? */ - if (!match_tbl) - return NULL; - - /* If this is the first time called on this word, state is 0 */ - if (!state) { - index = 0; - len = (int)strlen(text); - } - - /* Return next name in the command list that paritally matches test */ - while ( (name = (match_tbl + index)->pc_name) ) { - index++; - - if (strncasecmp(name, text, len) == 0) { - return(strdup(name)); - } - } - - /* No more matches */ - return NULL; -} - -/* probably called by readline */ -static char **command_completion(char * text, int start, int end) -{ - command_t * table; - char * pos; - - match_tbl = top_level; - - for (table = find_cmd(rl_line_buffer, match_tbl, &pos); - table; table = find_cmd(pos, match_tbl, &pos)) - { - - if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; - } - - return completion_matches(text, command_generator); -} -#endif - -/* take a string and execute the function or print help */ -int execute_line(char * line) -{ - command_t *cmd, *ambig; - char *prev; - char *next, *tmp; - char *argv[MAXARGS]; - int i; - int rc = 0; - - switch( process(line, &next, top_level, &cmd, &prev) ) { - case CMD_AMBIG: - fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); - while( (ambig = find_cmd(prev, cmd, &tmp)) ) { - fprintf(stderr, "%s ", ambig->pc_name); - cmd = ambig + 1; - } - fprintf(stderr, "\n"); - break; - case CMD_NONE: - fprintf(stderr, "No such command, type help\n"); - break; - case CMD_INCOMPLETE: - fprintf(stderr, - "'%s' incomplete command. 
Use '%s x' where x is one of:\n", - line, line); - fprintf(stderr, "\t"); - for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) { - fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); - } - fprintf(stderr, "\n"); - break; - case CMD_COMPLETE: - i = line2args(line, argv, MAXARGS); - rc = (cmd->pc_func)(i, argv); - - if (rc == CMD_HELP) - fprintf(stderr, "%s\n", cmd->pc_help); - - break; - } - - return rc; -} - -int -noop_fn () -{ - return (0); -} - -/* just in case you're ever in an airplane and discover you - forgot to install readline-dev. :) */ -int init_input() -{ - int interactive = isatty (fileno (stdin)); - -#ifdef HAVE_LIBREADLINE - using_history(); - stifle_history(HISTORY); - - if (!interactive) - { - rl_prep_term_function = (rl_vintfunc_t *)noop_fn; - rl_deprep_term_function = (rl_voidfunc_t *)noop_fn; - } - - rl_attempted_completion_function = (CPPFunction *)command_completion; - rl_completion_entry_function = (void *)command_generator; -#endif - return interactive; -} - -#ifndef HAVE_LIBREADLINE -#define add_history(s) -char * readline(char * prompt) -{ - char line[2048]; - int n = 0; - if (prompt) - printf ("%s", prompt); - if (fgets(line, sizeof(line), stdin) == NULL) - return (NULL); - n = strlen(line); - if (n && line[n-1] == '\n') - line[n-1] = '\0'; - return strdup(line); -} -#endif - -/* this is the command execution machine */ -int Parser_commands(void) -{ - char *line, *s; - int rc = 0; - int interactive; - - interactive = init_input(); - - while(!done) { - line = readline(interactive ? 
parser_prompt : NULL); - - if (!line) break; - - s = skipwhitespace(line); - - if (*s) { - add_history(s); - rc = execute_line(s); - } - - free(line); - } - return rc; -} - - -/* sets the parser prompt */ -void Parser_init(char * prompt, command_t * cmds) -{ - done = 0; - top_level = cmds; - if (parser_prompt) free(parser_prompt); - parser_prompt = strdup(prompt); -} - -/* frees the parser prompt */ -void Parser_exit(int argc, char *argv[]) -{ - done = 1; - free(parser_prompt); - parser_prompt = NULL; -} - -/* convert a string to an integer */ -int Parser_int(char *s, int *val) -{ - int ret; - - if (*s != '0') - ret = sscanf(s, "%d", val); - else if (*(s+1) != 'x') - ret = sscanf(s, "%o", val); - else { - s++; - ret = sscanf(++s, "%x", val); - } - - return(ret); -} - - -void Parser_qhelp(int argc, char *argv[]) { - - printf("Available commands are:\n"); - - print_commands(NULL, top_level); - printf("For more help type: help command-name\n"); -} - -int Parser_help(int argc, char **argv) -{ - char line[1024]; - char *next, *prev, *tmp; - command_t *result, *ambig; - int i; - - if ( argc == 1 ) { - Parser_qhelp(argc, argv); - return 0; - } - - line[0]='\0'; - for ( i = 1 ; i < argc ; i++ ) { - strcat(line, argv[i]); - } - - switch ( process(line, &next, top_level, &result, &prev) ) { - case CMD_COMPLETE: - fprintf(stderr, "%s: %s\n",line, result->pc_help); - break; - case CMD_NONE: - fprintf(stderr, "%s: Unknown command.\n", line); - break; - case CMD_INCOMPLETE: - fprintf(stderr, - "'%s' incomplete command. 
Use '%s x' where x is one of:\n", - line, line); - fprintf(stderr, "\t"); - for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { - fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); - } - fprintf(stderr, "\n"); - break; - case CMD_AMBIG: - fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); - while( (ambig = find_cmd(prev, result, &tmp)) ) { - fprintf(stderr, "%s ", ambig->pc_name); - result = ambig + 1; - } - fprintf(stderr, "\n"); - break; - } - return 0; -} - - -void Parser_printhelp(char *cmd) -{ - char *argv[] = { "help", cmd }; - Parser_help(2, argv); -} - -/************************************************************************* - * COMMANDS * - *************************************************************************/ - - -static void print_commands(char * str, command_t * table) { - command_t * cmds; - char buf[80]; - - for (cmds = table; cmds->pc_name; cmds++) { - if (cmds->pc_func) { - if (str) printf("\t%s %s\n", str, cmds->pc_name); - else printf("\t%s\n", cmds->pc_name); - } - if (cmds->pc_sub_cmd) { - if (str) { - sprintf(buf, "%s %s", str, cmds->pc_name); - print_commands(buf, cmds->pc_sub_cmd); - } else { - print_commands(cmds->pc_name, cmds->pc_sub_cmd); - } - } - } -} - -char *Parser_getstr(const char *prompt, const char *deft, char *res, - size_t len) -{ - char *line = NULL; - int size = strlen(prompt) + strlen(deft) + 8; - char *theprompt; - theprompt = malloc(size); - assert(theprompt); - - sprintf(theprompt, "%s [%s]: ", prompt, deft); - - line = readline(theprompt); - free(theprompt); - - if ( line == NULL || *line == '\0' ) { - strncpy(res, deft, len); - } else { - strncpy(res, line, len); - } - - if ( line ) { - free(line); - return res; - } else { - return NULL; - } -} - -/* get integer from prompt, loop forever to get it */ -int Parser_getint(const char *prompt, long min, long max, long deft, int base) -{ - int rc; - long result; - char *line; - int size = strlen(prompt) + 40; - char *theprompt = malloc(size); - 
assert(theprompt); - sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); - - fflush(stdout); - - do { - line = NULL; - line = readline(theprompt); - if ( !line ) { - fprintf(stdout, "Please enter an integer.\n"); - fflush(stdout); - continue; - } - if ( *line == '\0' ) { - free(line); - result = deft; - break; - } - rc = Parser_arg2int(line, &result, base); - free(line); - if ( rc != 0 ) { - fprintf(stdout, "Invalid string.\n"); - fflush(stdout); - } else if ( result > max || result < min ) { - fprintf(stdout, "Error: response must lie between %ld and %ld.\n", - min, max); - fflush(stdout); - } else { - break; - } - } while ( 1 ) ; - - if (theprompt) - free(theprompt); - return result; - -} - -/* get boolean (starting with YyNn; loop forever */ -int Parser_getbool(const char *prompt, int deft) -{ - int result = 0; - char *line; - int size = strlen(prompt) + 8; - char *theprompt = malloc(size); - assert(theprompt); - - fflush(stdout); - - if ( deft != 0 && deft != 1 ) { - fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n", - deft); - assert ( 0 ); - } - sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y"); - - do { - line = NULL; - line = readline(theprompt); - if ( line == NULL ) { - result = deft; - break; - } - if ( *line == '\0' ) { - result = deft; - break; - } - if ( *line == 'y' || *line == 'Y' ) { - result = 1; - break; - } - if ( *line == 'n' || *line == 'N' ) { - result = 0; - break; - } - if ( line ) - free(line); - fprintf(stdout, "Invalid string. 
Must start with yY or nN\n"); - fflush(stdout); - } while ( 1 ); - - if ( line ) - free(line); - if ( theprompt ) - free(theprompt); - return result; -} - -/* parse int out of a string or prompt for it */ -long Parser_intarg(const char *inp, const char *prompt, int deft, - int min, int max, int base) -{ - long result; - int rc; - - rc = Parser_arg2int(inp, &result, base); - - if ( rc == 0 ) { - return result; - } else { - return Parser_getint(prompt, deft, min, max, base); - } -} - -/* parse int out of a string or prompt for it */ -char *Parser_strarg(char *inp, const char *prompt, const char *deft, - char *answer, int len) -{ - if ( inp == NULL || *inp == '\0' ) { - return Parser_getstr(prompt, deft, answer, len); - } else - return inp; -} - -/* change a string into a number: return 0 on success. No invalid characters - allowed. The processing of base and validity follows strtol(3)*/ -int Parser_arg2int(const char *inp, long *result, int base) -{ - char *endptr; - - if ( (base !=0) && (base < 2 || base > 36) ) - return 1; - - *result = strtol(inp, &endptr, base); - - if ( *inp != '\0' && *endptr == '\0' ) - return 0; - else - return 1; -} - -int Parser_quit(int argc, char **argv) -{ - argc = argc; - argv = argv; - done = 1; - return 0; -} diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h deleted file mode 100644 index 9e7e95a..0000000 --- a/lustre/portals/utils/parser.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef _PARSER_H_ -#define _PARSER_H_ - -#define HISTORY 100 /* Don't let history grow unbounded */ -#define MAXARGS 512 - -#define CMD_COMPLETE 0 -#define CMD_INCOMPLETE 1 -#define CMD_NONE 2 -#define CMD_AMBIG 3 -#define CMD_HELP 4 - -typedef struct parser_cmd { - char *pc_name; - int (* pc_func)(int, char **); - struct parser_cmd * pc_sub_cmd; - char *pc_help; -} command_t; - -typedef struct argcmd { - char *ac_name; - int (*ac_func)(int, char **); - char *ac_help; -} argcmd_t; - -typedef struct network { - char *type; - char *server; - 
int port; -} network_t; - -int Parser_quit(int argc, char **argv); -void Parser_init(char *, command_t *); /* Set prompt and load command list */ -int Parser_commands(void); /* Start the command parser */ -void Parser_qhelp(int, char **); /* Quick help routine */ -int Parser_help(int, char **); /* Detailed help routine */ -void Parser_printhelp(char *); /* Detailed help routine */ -void Parser_exit(int, char **); /* Shuts down command parser */ -int Parser_execarg(int argc, char **argv, command_t cmds[]); -int execute_line(char * line); - -/* Converts a string to an integer */ -int Parser_int(char *, int *); - -/* Prompts for a string, with default values and a maximum length */ -char *Parser_getstr(const char *prompt, const char *deft, char *res, - size_t len); - -/* Prompts for an integer, with minimum, maximum and default values and base */ -int Parser_getint(const char *prompt, long min, long max, long deft, - int base); - -/* Prompts for a yes/no, with default */ -int Parser_getbool(const char *prompt, int deft); - -/* Extracts an integer from a string, or prompts if it cannot get one */ -long Parser_intarg(const char *inp, const char *prompt, int deft, - int min, int max, int base); - -/* Extracts a word from the input, or propmts if it cannot get one */ -char *Parser_strarg(char *inp, const char *prompt, const char *deft, - char *answer, int len); - -/* Extracts an integer from a string with a base */ -int Parser_arg2int(const char *inp, long *result, int base); - -#endif diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c deleted file mode 100644 index 9c1537b..0000000 --- a/lustre/portals/utils/portals.c +++ /dev/null @@ -1,1935 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#ifdef HAVE_NETDB_H -#include -#endif -#include -#ifdef HAVE_NETINET_TCP_H -#include -#endif -#include -#include -#include -#include "ioctl.h" -#include -#include -#include -#include -#include -#include -#if CRAY_PORTALS -#ifdef REDSTORM -#define __QK__ -#endif -#include -#endif - -#ifdef __CYGWIN__ - -#include - -#endif /* __CYGWIN__ */ - -#include -#include -#include -#include -#include -#include "parser.h" - -unsigned int portal_debug; -unsigned int portal_printk; - -static unsigned int g_nal = 0; - -typedef struct -{ - char *name; - int num; -} name2num_t; - -static name2num_t nalnames[] = { - {"any", 0}, -#if !CRAY_PORTALS - {"tcp", SOCKNAL}, - {"elan", QSWNAL}, - {"gm", GMNAL}, - {"openib", OPENIBNAL}, - {"iib", IIBNAL}, - {"lo", LONAL}, - {"ra", RANAL}, -#else - {"cray_kern_nal", CRAY_KERN_NAL}, - {"cray_user_nal", CRAY_USER_NAL}, - {"cray_qk_nal", CRAY_QK_NAL}, -#endif - {NULL, -1} -}; - -static cfg_record_cb_t g_record_cb; - -/* Convert a string boolean to an int; "enable" -> 1 */ -int ptl_parse_bool (int *b, char *str) { - if (!strcasecmp (str, "no") || - !strcasecmp (str, "n") || - !strcasecmp (str, "off") || - !strcasecmp (str, "down") || - !strcasecmp (str, "disable")) - { - *b = 0; - return (0); - } - - if (!strcasecmp (str, 
"yes") || - !strcasecmp (str, "y") || - !strcasecmp (str, "on") || - !strcasecmp (str, "up") || - !strcasecmp (str, "enable")) - { - *b = 1; - return (0); - } - - return (-1); -} - -/* Convert human readable size string to and int; "1k" -> 1000 */ -int ptl_parse_size (int *sizep, char *str) { - int size; - char mod[32]; - - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { - default: - return (-1); - - case 1: - *sizep = size; - return (0); - - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } -} - -int -ptl_set_cfg_record_cb(cfg_record_cb_t cb) -{ - g_record_cb = cb; - return 0; -} - -int -pcfg_ioctl(struct portals_cfg *pcfg) -{ - int rc; - - if (pcfg->pcfg_nal ==0) - pcfg->pcfg_nal = g_nal; - - if (g_record_cb) { - rc = g_record_cb(PORTALS_CFG_TYPE, sizeof(*pcfg), pcfg); - } else { - struct portal_ioctl_data data; - PORTAL_IOC_INIT (data); - data.ioc_pbuf1 = (char*)pcfg; - data.ioc_plen1 = sizeof(*pcfg); - /* XXX liblustre hack XXX */ - data.ioc_nal_cmd = pcfg->pcfg_command; - data.ioc_nid = pcfg->pcfg_nid; - - rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - } - - return (rc); -} - - - -static name2num_t * -name2num_lookup_name (name2num_t *table, char *str) -{ - while (table->name != NULL) - if (!strcmp (str, table->name)) - return (table); - else - table++; - return (NULL); -} - -static name2num_t * -name2num_lookup_num (name2num_t *table, int num) -{ - while (table->name != NULL) - if (num == table->num) - return (table); - else - table++; - return (NULL); -} - -int -ptl_name2nal (char *str) -{ - name2num_t *e = name2num_lookup_name (nalnames, str); - - return ((e == NULL) ? -1 : e->num); -} - -static char * -nal2name (int nal) -{ - name2num_t *e = name2num_lookup_num (nalnames, nal); - - return ((e == NULL) ? "???" 
: e->name); -} - -#ifdef HAVE_GETHOSTBYNAME -static struct hostent * -ptl_gethostbyname(char * hname) { - struct hostent *he; - he = gethostbyname(hname); - if (!he) { - switch(h_errno) { - case HOST_NOT_FOUND: - case NO_ADDRESS: - fprintf(stderr, "Unable to resolve hostname: %s\n", - hname); - break; - default: - fprintf(stderr, "gethostbyname error: %s\n", - strerror(errno)); - break; - } - return NULL; - } - return he; -} -#endif - -int -ptl_parse_port (int *port, char *str) -{ - char *end; - - *port = strtol (str, &end, 0); - - if (*end == 0 && /* parsed whole string */ - *port > 0 && *port < 65536) /* minimal sanity check */ - return (0); - - return (-1); -} - -int -ptl_parse_time (time_t *t, char *str) -{ - char *end; - int n; - struct tm tm; - - *t = strtol (str, &end, 0); - if (*end == 0) /* parsed whole string */ - return (0); - - memset (&tm, 0, sizeof (tm)); - n = sscanf (str, "%d-%d-%d-%d:%d:%d", - &tm.tm_year, &tm.tm_mon, &tm.tm_mday, - &tm.tm_hour, &tm.tm_min, &tm.tm_sec); - if (n != 6) - return (-1); - - tm.tm_mon--; /* convert to 0 == Jan */ - tm.tm_year -= 1900; /* y2k quirk */ - tm.tm_isdst = -1; /* dunno if it's daylight savings... 
*/ - - *t = mktime (&tm); - if (*t == (time_t)-1) - return (-1); - - return (0); -} - -int -ptl_parse_ipquad (__u32 *ipaddrp, char *str) -{ - int a; - int b; - int c; - int d; - - if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && - (a & ~0xff) == 0 && (b & ~0xff) == 0 && - (c & ~0xff) == 0 && (d & ~0xff) == 0) - { - *ipaddrp = (a<<24)|(b<<16)|(c<<8)|d; - return (0); - } - - return (-1); -} - -int -ptl_parse_ipaddr (__u32 *ipaddrp, char *str) -{ -#ifdef HAVE_GETHOSTBYNAME - struct hostent *he; -#endif - - if (!strcmp (str, "_all_")) - { - *ipaddrp = 0; - return (0); - } - - if (ptl_parse_ipquad(ipaddrp, str) == 0) - return (0); - -#if HAVE_GETHOSTBYNAME - if ((('a' <= str[0] && str[0] <= 'z') || - ('A' <= str[0] && str[0] <= 'Z')) && - (he = ptl_gethostbyname (str)) != NULL) - { - __u32 addr = *(__u32 *)he->h_addr; - - *ipaddrp = ntohl(addr); /* HOST byte order */ - return (0); - } -#endif - - return (-1); -} - -char * -ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) -{ -#ifdef HAVE_GETHOSTBYNAME - __u32 net_ip; - struct hostent *he; - - if (lookup) { - net_ip = htonl (ipaddr); - he = gethostbyaddr (&net_ip, sizeof (net_ip), AF_INET); - if (he != NULL) { - strcpy(str, he->h_name); - return (str); - } - } -#endif - - sprintf (str, "%d.%d.%d.%d", - (ipaddr >> 24) & 0xff, (ipaddr >> 16) & 0xff, - (ipaddr >> 8) & 0xff, ipaddr & 0xff); - return (str); -} - -int -ptl_parse_nid (ptl_nid_t *nidp, char *str) -{ - __u32 ipaddr; - char *end; - unsigned long long ullval; - - if (!strcmp (str, "_all_")) { - *nidp = PTL_NID_ANY; - return (0); - } - - if (ptl_parse_ipaddr (&ipaddr, str) == 0) { -#if !CRAY_PORTALS - *nidp = (ptl_nid_t)ipaddr; -#else - *nidp = (((ptl_nid_t)ipaddr & PNAL_HOSTID_MASK) << PNAL_VNODE_SHIFT); -#endif - return (0); - } - - ullval = strtoull(str, &end, 0); - if (*end == 0) { - /* parsed whole string */ - *nidp = (ptl_nid_t)ullval; - return (0); - } - - return (-1); -} - -__u64 ptl_nid2u64(ptl_nid_t nid) -{ - switch (sizeof (nid)) { - case 8: - 
return (nid); - case 4: - return ((__u32)nid); - default: - fprintf(stderr, "Unexpected sizeof(ptl_nid_t) == %u\n", sizeof(nid)); - abort(); - /* notreached */ - return (-1); - } -} - -char * -ptl_nid2str (char *buffer, ptl_nid_t nid) -{ - __u64 nid64 = ptl_nid2u64(nid); -#ifdef HAVE_GETHOSTBYNAME - struct hostent *he = 0; - - /* Don't try to resolve NIDs that are e.g. Elan host IDs. Assume - * TCP addresses in the 0.x.x.x subnet are not in use. This can - * happen on routers and slows things down a _lot_. Bug 3442. */ - if (nid & 0xff000000) { - __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ - - he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); - } - - if (he != NULL) - sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name); - else -#endif /* HAVE_GETHOSTBYNAME */ - sprintf(buffer, LPX64, nid64); - - return (buffer); -} - -int g_nal_is_set () -{ - if (g_nal == 0) { - fprintf (stderr, "Error: you must run the 'network' command first.\n"); - return (0); - } - - return (1); -} - -int g_nal_is_compatible (char *cmd, ...) -{ - va_list ap; - int nal; - - if (!g_nal_is_set ()) - return (0); - - va_start (ap, cmd); - - do { - nal = va_arg (ap, int); - } while (nal != 0 && nal != g_nal); - - va_end (ap); - - if (g_nal == nal) - return (1); - - if (cmd != NULL) { - /* Don't complain verbosely if we've not been passed a command - * name to complain about! 
*/ - fprintf (stderr, "Command %s not compatible with nal %s\n", - cmd, nal2name (g_nal)); - } - return (0); -} - -int -sock_write (int cfd, void *buffer, int nob) -{ - while (nob > 0) - { - int rc = write (cfd, buffer, nob); - - if (rc < 0) - { - if (errno == EINTR) - continue; - - return (rc); - } - - if (rc == 0) - { - fprintf (stderr, "Unexpected zero sock_write\n"); - abort(); - } - - nob -= rc; - buffer = (char *)buffer + nob; - } - - return (0); -} - -int -sock_read (int cfd, void *buffer, int nob) -{ - while (nob > 0) - { - int rc = read (cfd, buffer, nob); - - if (rc < 0) - { - if (errno == EINTR) - continue; - - return (rc); - } - - if (rc == 0) /* EOF */ - { - errno = ECONNABORTED; - return (-1); - } - - nob -= rc; - buffer = (char *)buffer + nob; - } - - return (0); -} - -int ptl_initialize(int argc, char **argv) -{ - register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); - return 0; -} - - -int jt_ptl_network(int argc, char **argv) -{ - name2num_t *entry; - int nal; - - if (argc == 2 && - (nal = ptl_name2nal (argv[1])) >= 0) { - g_nal = nal; - return (0); - } - - fprintf(stderr, "usage: %s \n", argv[0]); - for (entry = nalnames; entry->name != NULL; entry++) - fprintf (stderr, "%s%s", entry == nalnames ? 
"<" : "|", entry->name); - fprintf(stderr, ">\n"); - return (-1); -} - -int -jt_ptl_print_interfaces (int argc, char **argv) -{ - struct portals_cfg pcfg; - char buffer[3][64]; - int index; - int rc; - - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) - return -1; - - for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_INTERFACE); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) - break; - - printf ("%s: (%s/%s) npeer %d nroute %d\n", - ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[2], 1), - ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[0], 0), - ptl_ipaddr_2_str(pcfg.pcfg_misc, buffer[1], 0), - pcfg.pcfg_fd, pcfg.pcfg_count); - } - - if (index == 0) - printf ("\n"); - return 0; -} - -int -jt_ptl_add_interface (int argc, char **argv) -{ - struct portals_cfg pcfg; - __u32 ipaddr; - int rc; - __u32 netmask = 0xffffff00; - int i; - int count; - char *end; - - if (argc < 2 || argc > 3) { - fprintf (stderr, "usage: %s ipaddr [netmask]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) - return -1; - - if (ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { - fprintf (stderr, "Can't parse ip: %s\n", argv[1]); - return -1; - } - - if (argc > 2 ) { - count = strtol(argv[2], &end, 0); - if (count > 0 && count < 32 && *end == 0) { - netmask = 0; - for (i = count; i > 0; i--) - netmask = netmask|(1<<(32-i)); - } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) { - fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); - return -1; - } - } - - PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE); - pcfg.pcfg_id = ipaddr; - pcfg.pcfg_misc = netmask; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) { - fprintf (stderr, "failed to add interface: %s\n", - strerror (errno)); - return -1; - } - - return 0; -} - -int -jt_ptl_del_interface (int argc, char **argv) -{ - struct portals_cfg pcfg; - int rc; - __u32 ipaddr = 0; - - if (argc > 2) { - fprintf (stderr, "usage: %s [ipaddr]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) - return 
-1; - - if (argc == 2 && - ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { - fprintf (stderr, "Can't parse ip: %s\n", argv[1]); - return -1; - } - - PCFG_INIT(pcfg, NAL_CMD_DEL_INTERFACE); - pcfg.pcfg_id = ipaddr; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) { - fprintf (stderr, "failed to delete interface: %s\n", - strerror (errno)); - return -1; - } - - return 0; -} - -int -jt_ptl_print_peers (int argc, char **argv) -{ - struct portals_cfg pcfg; - char buffer[2][64]; - int index; - int rc; - - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) - return -1; - - for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_PEER); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) - break; - - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) - printf (LPX64"[%d]%s@%s:%d #%d\n", - pcfg.pcfg_nid, pcfg.pcfg_wait, - ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1), - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), - pcfg.pcfg_misc, pcfg.pcfg_count); - else - printf (LPX64"[%d]\n", - pcfg.pcfg_nid, pcfg.pcfg_wait); - } - - if (index == 0) - printf ("\n"); - return 0; -} - -int -jt_ptl_add_peer (int argc, char **argv) -{ - struct portals_cfg pcfg; - ptl_nid_t nid; - __u32 ip = 0; - int port = 0; - int rc; - - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) - return -1; - - if (g_nal_is_compatible(NULL, SOCKNAL, RANAL, 0)) { - if (argc != 4) { - fprintf (stderr, "usage(tcp): %s nid ipaddr port\n", - argv[0]); - return 0; - } - } else if (argc != 2) { - fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]); - return 0; - } - - if (ptl_parse_nid (&nid, argv[1]) != 0 || - nid == PTL_NID_ANY) { - fprintf (stderr, "Can't parse NID: %s\n", argv[1]); - return -1; - } - - if (g_nal_is_compatible (NULL, SOCKNAL, RANAL, 0)) { - if (ptl_parse_ipaddr (&ip, argv[2]) != 0) { - fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); - return -1; - } - - if (ptl_parse_port (&port, argv[3]) != 0) { - fprintf (stderr, "Can't parse port: 
%s\n", argv[3]); - return -1; - } - } - - PCFG_INIT(pcfg, NAL_CMD_ADD_PEER); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ip; - pcfg.pcfg_misc = port; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) { - fprintf (stderr, "failed to add peer: %s\n", - strerror (errno)); - return -1; - } - - return 0; -} - -int -jt_ptl_del_peer (int argc, char **argv) -{ - struct portals_cfg pcfg; - ptl_nid_t nid = PTL_NID_ANY; - __u32 ip = 0; - int single_share = 0; - int argidx; - int rc; - - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) - return -1; - - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { - if (argc > 4) { - fprintf (stderr, "usage: %s [nid] [ipaddr] [single_share]\n", - argv[0]); - return 0; - } - } else if (argc > 3) { - fprintf (stderr, "usage: %s [nid] [single_share]\n", argv[0]); - return 0; - } - - if (argc > 1 && - ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf (stderr, "Can't parse nid: %s\n", argv[1]); - return -1; - } - - argidx = 2; - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { - if (argc > argidx && - ptl_parse_ipaddr (&ip, argv[argidx]) != 0) { - fprintf (stderr, "Can't parse ip addr: %s\n", - argv[argidx]); - return -1; - } - argidx++; - } - - if (argc > argidx) { - if (!strcmp (argv[argidx], "single_share")) { - single_share = 1; - } else { - fprintf (stderr, "Unrecognised arg %s'\n", argv[3]); - return -1; - } - } - - PCFG_INIT(pcfg, NAL_CMD_DEL_PEER); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ip; - pcfg.pcfg_flags = single_share; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) { - fprintf (stderr, "failed to remove peer: %s\n", - strerror (errno)); - return -1; - } - - return 0; -} - -int -jt_ptl_print_connections (int argc, char **argv) -{ - struct portals_cfg pcfg; - char buffer[2][64]; - int index; - int rc; - - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) - return -1; - - for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_CONN); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); - if (rc != 0) - 
break; - - if (g_nal_is_compatible (NULL, SOCKNAL, 0)) - printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n", - pcfg.pcfg_gw_nal, /* scheduler */ - ptl_ipaddr_2_str (pcfg.pcfg_fd, buffer[0], 1), /* local IP addr */ - pcfg.pcfg_nid, - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), /* remote IP addr */ - pcfg.pcfg_misc, /* remote port */ - (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" : - (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?", - pcfg.pcfg_count, /* tx buffer size */ - pcfg.pcfg_size, /* rx buffer size */ - pcfg.pcfg_wait ? "nagle" : "nonagle"); - else - printf (LPX64"\n", - pcfg.pcfg_nid); - } - - if (index == 0) - printf ("\n"); - return 0; -} - -int jt_ptl_connect(int argc, char **argv) -{ -#ifndef HAVE_CONNECT - /* no connect() support */ - return -1; -#else /* HAVE_CONNECT */ - struct portals_cfg pcfg; - struct sockaddr_in srvaddr; - struct sockaddr_in locaddr; - __u32 ipaddr; - char *flag; - int fd, rc; - int type = SOCKNAL_CONN_ANY; - int port, rport; - int o; - - if (argc < 3) { - fprintf(stderr, "usage: %s ip port [type]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) - return -1; - - rc = ptl_parse_ipaddr (&ipaddr, argv[1]); - if (rc != 0) { - fprintf(stderr, "Can't parse hostname: %s\n", argv[1]); - return -1; - } - - if (ptl_parse_port (&port, argv[2]) != 0) { - fprintf (stderr, "Can't parse port: %s\n", argv[2]); - return -1; - } - - if (argc > 3) - for (flag = argv[3]; *flag != 0; flag++) - switch (*flag) - { - case 'I': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - } - type = SOCKNAL_CONN_BULK_IN; - break; - - case 'O': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - } - type = SOCKNAL_CONN_BULK_OUT; - break; - - case 'C': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - 
} - type = SOCKNAL_CONN_CONTROL; - break; - - default: - fprintf (stderr, "unrecognised flag '%c'\n", - *flag); - return (-1); - } - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_addr.s_addr = INADDR_ANY; - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(port); - srvaddr.sin_addr.s_addr = htonl(ipaddr); - - - for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { - fd = socket(PF_INET, SOCK_STREAM, 0); - if ( fd < 0 ) { - fprintf(stderr, "socket() failed: %s\n", strerror(errno)); - return -1; - } - - o = 1; - rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &o, sizeof(o)); - - locaddr.sin_port = htons(rport); - rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == 0 || errno == EACCES) { - rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if (rc == 0) { - break; - } else if (errno != EADDRINUSE) { - fprintf(stderr, "Error connecting to host: %s\n", strerror(errno)); - close(fd); - return -1; - } - } else if (errno != EADDRINUSE) { - fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno)); - close(fd); - return -1; - } - } - - if (rport == IPPORT_RESERVED / 2) { - fprintf(stderr, - "Warning: all privileged ports are in use.\n"); - return -1; - } - - printf("Connected host: %s type: %s\n", - argv[1], - (type == SOCKNAL_CONN_ANY) ? "A" : - (type == SOCKNAL_CONN_CONTROL) ? "C" : - (type == SOCKNAL_CONN_BULK_IN) ? "I" : - (type == SOCKNAL_CONN_BULK_OUT) ? 
"O" : "?"); - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); - pcfg.pcfg_nal = g_nal; - pcfg.pcfg_fd = fd; - pcfg.pcfg_misc = type; - - rc = pcfg_ioctl(&pcfg); - if (rc) { - fprintf(stderr, "failed to register fd with portals: %s\n", - strerror(errno)); - close (fd); - return -1; - } - - printf("Connection to %s registered with socknal\n", argv[1]); - - rc = close(fd); - if (rc) - fprintf(stderr, "close failed: %d\n", rc); - - return 0; -#endif /* HAVE_CONNECT */ -} - -int jt_ptl_disconnect(int argc, char **argv) -{ - struct portals_cfg pcfg; - ptl_nid_t nid = PTL_NID_ANY; - __u32 ipaddr = 0; - int rc; - - if (argc > 3) { - fprintf(stderr, "usage: %s [nid] [ipaddr]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) - return 0; - - if (argc >= 2 && - ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf (stderr, "Can't parse nid %s\n", argv[1]); - return -1; - } - - if (g_nal_is_compatible (NULL, SOCKNAL, 0) && - argc >= 3 && - ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { - fprintf (stderr, "Can't parse ip addr %s\n", argv[2]); - return -1; - } - - PCFG_INIT(pcfg, NAL_CMD_CLOSE_CONNECTION); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ipaddr; - - rc = pcfg_ioctl(&pcfg); - if (rc) { - fprintf(stderr, "failed to remove connection: %s\n", - strerror(errno)); - return -1; - } - - return 0; -} - -int jt_ptl_push_connection (int argc, char **argv) -{ - struct portals_cfg pcfg; - int rc; - ptl_nid_t nid = PTL_NID_ANY; - __u32 ipaddr = 0; - - if (argc > 3) { - fprintf(stderr, "usage: %s [nid] [ip]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) - return -1; - - if (argc > 1 && - ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf(stderr, "Can't parse nid: %s\n", argv[1]); - return -1; - } - - if (argc > 2 && - ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { - fprintf(stderr, "Can't parse ipaddr: %s\n", argv[2]); - } - - PCFG_INIT(pcfg, NAL_CMD_PUSH_CONNECTION); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ipaddr; 
- - rc = pcfg_ioctl(&pcfg); - if (rc) { - fprintf(stderr, "failed to push connection: %s\n", - strerror(errno)); - return -1; - } - - return 0; -} - -int -jt_ptl_print_active_txs (int argc, char **argv) -{ - struct portals_cfg pcfg; - int index; - int rc; - - if (!g_nal_is_compatible (argv[0], QSWNAL, 0)) - return -1; - - for (index = 0;;index++) { - PCFG_INIT(pcfg, NAL_CMD_GET_TXDESC); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - break; - - printf ("%p: %5s payload %6d bytes to "LPX64" via "LPX64" by pid %6d: %s, %s, state %d\n", - pcfg.pcfg_pbuf1, - pcfg.pcfg_count == PTL_MSG_ACK ? "ACK" : - pcfg.pcfg_count == PTL_MSG_PUT ? "PUT" : - pcfg.pcfg_count == PTL_MSG_GET ? "GET" : - pcfg.pcfg_count == PTL_MSG_REPLY ? "REPLY" : "", - pcfg.pcfg_size, - pcfg.pcfg_nid, - pcfg.pcfg_nid2, - pcfg.pcfg_misc, - (pcfg.pcfg_flags & 1) ? "delayed" : "immediate", - (pcfg.pcfg_flags & 2) ? "nblk" : "normal", - pcfg.pcfg_flags >> 2); - } - - if (index == 0) - printf ("\n"); - return 0; -} - -int jt_ptl_ping(int argc, char **argv) -{ - int rc; - ptl_nid_t nid; - long count = 1; - long size = 4; - long timeout = 1; - struct portal_ioctl_data data; - - if (argc < 2) { - fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); - return 0; - } - - if (!g_nal_is_set()) - return -1; - - if (ptl_parse_nid (&nid, argv[1]) != 0) - { - fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); - return (-1); - } - - if (argc > 2) - { - count = atol(argv[2]); - - if (count < 0 || count > 20000) - { - fprintf(stderr, "are you insane? 
%ld is a crazy count.\n", count); - return -1; - } - } - - if (argc > 3) - size= atol(argv[3]); - - if (argc > 4) - timeout = atol (argv[4]); - - PORTAL_IOC_INIT (data); - data.ioc_count = count; - data.ioc_size = size; - data.ioc_nid = nid; - data.ioc_nal = g_nal; - data.ioc_timeout = timeout; - - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); - if (rc) { - fprintf(stderr, "failed to start pinger: %s\n", - strerror(errno)); - return -1; - } - return 0; -} - -int jt_ptl_shownid(int argc, char **argv) -{ - struct portal_ioctl_data data; - int rc; - - if (argc > 1) { - fprintf(stderr, "usage: %s\n", argv[0]); - return 0; - } - - if (!g_nal_is_set()) - return -1; - - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); - if (rc < 0) - fprintf(stderr, "getting my NID failed: %s\n", - strerror (errno)); - else - printf(LPX64"\n", data.ioc_nid); - return 0; -} - -int jt_ptl_mynid(int argc, char **argv) -{ - int rc; - char hostname[1024]; - char *nidstr; - struct portals_cfg pcfg; - ptl_nid_t mynid; - - if (argc > 2) { - fprintf(stderr, "usage: %s [NID]\n", argv[0]); - fprintf(stderr, "NID defaults to the primary IP address of the machine.\n"); - return 0; - } - - if (!g_nal_is_set()) - return -1; - - if (argc >= 2) - nidstr = argv[1]; - else if (gethostname(hostname, sizeof(hostname)) != 0) { - fprintf(stderr, "gethostname failed: %s\n", - strerror(errno)); - return -1; - } - else - nidstr = hostname; - - rc = ptl_parse_nid (&mynid, nidstr); - if (rc != 0) { - fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr); - return -1; - } - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID); - pcfg.pcfg_nid = mynid; - - rc = pcfg_ioctl(&pcfg); - if (rc < 0) - fprintf(stderr, "setting my NID failed: %s\n", - strerror(errno)); - else - printf("registered my nid "LPX64" (%s)\n", - ptl_nid2u64(mynid), hostname); - return 0; -} - -int -jt_ptl_fail_nid (int argc, char **argv) -{ - int rc; - ptl_nid_t nid; - unsigned int 
threshold; - struct portal_ioctl_data data; - - if (argc < 2 || argc > 3) - { - fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); - return (0); - } - - if (!g_nal_is_set()) - return (-1); - - if (!strcmp (argv[1], "_all_")) - nid = PTL_NID_ANY; - else if (ptl_parse_nid (&nid, argv[1]) != 0) - { - fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); - return (-1); - } - - if (argc < 3) - threshold = PTL_MD_THRESH_INF; - else if (sscanf (argv[2], "%i", &threshold) != 1) { - fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); - return (-1); - } - - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; - data.ioc_nid = nid; - data.ioc_count = threshold; - - rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); - if (rc < 0) - fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", - strerror (errno)); - else - printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]); - - return (0); -} - -int -jt_ptl_add_route (int argc, char **argv) -{ - struct portals_cfg pcfg; - ptl_nid_t nid1; - ptl_nid_t nid2; - ptl_nid_t gateway_nid; - int rc; - - if (argc < 3) - { - fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); - return (0); - } - - if (!g_nal_is_set()) - return (-1); - - if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) - { - fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); - return (-1); - } - - if (ptl_parse_nid (&nid1, argv[2]) != 0) - { - fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); - return (-1); - } - - if (argc < 4) - nid2 = nid1; - else if (ptl_parse_nid (&nid2, argv[3]) != 0) - { - fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[4]); - return (-1); - } - - PCFG_INIT(pcfg, NAL_CMD_ADD_ROUTE); - pcfg.pcfg_nid = gateway_nid; - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_gw_nal = g_nal; - pcfg.pcfg_nid2 = MIN (nid1, nid2); - pcfg.pcfg_nid3 = MAX (nid1, nid2); - - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - { - fprintf (stderr, "NAL_CMD_ADD_ROUTE failed: %s\n", strerror 
(errno)); - return (-1); - } - - return (0); -} - -int -jt_ptl_del_route (int argc, char **argv) -{ - struct portals_cfg pcfg; - ptl_nid_t nid; - ptl_nid_t nid1 = PTL_NID_ANY; - ptl_nid_t nid2 = PTL_NID_ANY; - int rc; - - if (argc < 2) - { - fprintf (stderr, "usage: %s targetNID\n", argv[0]); - return (0); - } - - if (!g_nal_is_set()) - return (-1); - - if (ptl_parse_nid (&nid, argv[1]) != 0) - { - fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); - return (-1); - } - - if (argc >= 3 && - ptl_parse_nid (&nid1, argv[2]) != 0) - { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]); - return (-1); - } - - if (argc < 4) { - nid2 = nid1; - } else { - if (ptl_parse_nid (&nid2, argv[3]) != 0) { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]); - return (-1); - } - - if (nid1 > nid2) { - ptl_nid_t tmp = nid1; - - nid1 = nid2; - nid2 = tmp; - } - } - - PCFG_INIT(pcfg, NAL_CMD_DEL_ROUTE); - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_gw_nal = g_nal; - pcfg.pcfg_nid = nid; - pcfg.pcfg_nid2 = nid1; - pcfg.pcfg_nid3 = nid2; - - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - { - fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", - ptl_nid2u64(nid), strerror (errno)); - return (-1); - } - - return (0); -} - -int -jt_ptl_notify_router (int argc, char **argv) -{ - struct portals_cfg pcfg; - int enable; - ptl_nid_t nid; - int rc; - struct timeval now; - time_t when; - - if (argc < 3) - { - fprintf (stderr, "usage: %s targetNID [