From: braam Date: Mon, 19 May 2003 04:27:44 +0000 (+0000) Subject: - add portals to Lustre X-Git-Tag: v1_7_100~1^91~276 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=63aae97fdb49754574fcf6faf4131f78c93fb051;p=fs%2Flustre-release.git - add portals to Lustre - fix up makefiles - add initial part of 2.5 in kernel make infrastructure - cleanup tcpnal prototypes etc. --- diff --git a/lnet/AUTHORS b/lnet/AUTHORS new file mode 100644 index 0000000..e69de29 diff --git a/lnet/ChangeLog b/lnet/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/lnet/Kernelenv.in b/lnet/Kernelenv.in new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lnet/Kernelenv.in @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lnet/Kernelenv.mk b/lnet/Kernelenv.mk new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lnet/Kernelenv.mk @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lnet/Makefile.am b/lnet/Makefile.am new file mode 100644 index 0000000..3c42103 --- /dev/null +++ b/lnet/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +EXTRA_DIST = Rules.linux archdep.m4 MCP +DIST_SUBDIRS = libcfs portals knals unals utils tests doc router +SUBDIRS = libcfs portals knals unals utils tests doc router diff --git a/lnet/Makefile.mk b/lnet/Makefile.mk new file mode 100644 index 0000000..be0e51a --- /dev/null +++ b/lnet/Makefile.mk @@ -0,0 +1,6 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += libcfs/ +obj-y += knals/ +obj-y += router/ diff --git a/lnet/NEWS b/lnet/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/lnet/README b/lnet/README new file mode 100644 index 0000000..e69de29 diff --git a/lnet/Rules.linux.in b/lnet/Rules.linux.in new file mode 100644 index 0000000..8247deb --- /dev/null +++ b/lnet/Rules.linux.in @@ -0,0 +1,37 @@ +# included in Linux kernel directories +# Rules for module building + +MODLINK=@MOD_LINK@ +if LINUX25 + + +basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g') +AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2 -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename) + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + + + +else + + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + + + +endif + + +tags: + rm -f $(top_srcdir)/TAGS + rm -f $(top_srcdir)/tags + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a + + + + diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 new file mode 100644 index 0000000..0315644 --- /dev/null +++ b/lnet/archdep.m4 @@ -0,0 +1,206 @@ + +# -------- in kernel compilation? 
(2.5 only) ------------- +AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles]) +AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) +echo "Makefile for in kernel build: $INKERNEL" + +# -------- liblustre compilation -------------- +AC_ARG_WITH(lib, [ --with-lib compile lustre library], host_cpu="lib") + +# -------- set linuxdir ------------ + +AC_ARG_WITH(linux, [ --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux) +AC_SUBST(LINUX) + +# --------- UML? -------------------- +AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...) +if test $host_cpu = "lib" ; then + host_cpu="lib" + AC_MSG_RESULT(no building Lustre library) +else + if test -e $LINUX/include/asm-um ; then + if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then + host_cpu="um"; + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no (asm doesn't point at asm-um)) + fi + + else + AC_MSG_RESULT(no (asm-um missing)) + fi +fi + +# --------- Linux 25 ------------------ + +AC_MSG_CHECKING(if you are running linux 2.5) +if test -e $LINUX/include/linux/namei.h ; then + linux25="yes" + AC_MSG_RESULT(yes) +else + linux25="no" + AC_MSG_RESULT(no) +fi +AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) +echo "Makefiles for in linux 2.5 build: $LINUX25" + +# ------- Makeflags ------------------ + +AC_MSG_CHECKING(setting make flags system architecture: ) +case ${host_cpu} in + lib ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall ' + KCPPFLAGS='-D__arch_lib__ ' + MOD_LINK=elf_i386 +;; + um ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common ' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include 
-I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include ' + ;; + esac + + MOD_LINK=elf_i386 +;; + i*86 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + ;; + esac + MOD_LINK=elf_i386 +;; + + alphaev6 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alphaev67 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alpha* ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + ia64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' + KCPPFLAGS='-D__KERNEL__ -DMODULE' + MOD_LINK=elf64_ia64 +;; + + sparc64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu 
-mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf64_sparc + +;; + + powerpc ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf32ppclinux +;; + + *) + AC_ERROR("Unknown Linux Platform: $host_cpu") +;; +esac + +# ----------- make dep run? ------------------ + +if test $host_cpu != "lib" ; then + AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) ) + if test -f $LINUX/include/linux/config.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.) + fi +fi + +# ------------ include paths ------------------ + +if test $host_cpu != "lib" ; then + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include -I$(LINUX)/include' +else + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include' +fi +CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS" + +if test $host_cpu != "lib" ; then +# ------------ autoconf.h ------------------ + AC_MSG_CHECKING(if autoconf.h is in kernel source) + if test -f $LINUX/include/linux/autoconf.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.) + fi + +# ------------ RELEASE and moduledir ------------------ + AC_MSG_CHECKING(for Linux release) + + dnl We need to rid ourselves of the nasty [ ] quotes. 
+ changequote(, ) + dnl Get release from version.h + RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`" + changequote([, ]) + + moduledir='$(libdir)/modules/'$RELEASE/kernel + AC_SUBST(moduledir) + + modulefsdir='$(moduledir)/fs/$(PACKAGE)' + AC_SUBST(modulefsdir) + + AC_MSG_RESULT($RELEASE) + AC_SUBST(RELEASE) + +# ---------- modversions? -------------------- + AC_MSG_CHECKING(for MODVERSIONS) + if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1; + then + MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB" + AC_MSG_RESULT(yes) + else + MFLAGS= + AC_MSG_RESULT(no) + fi +fi + +# ---------- SMP ------------------- +#AC_MSG_CHECKING(for SMP) +#if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then +# SMPFLAG= +# AC_MSG_RESULT(yes) +#else +# SMPFLAG= +# AC_MSG_RESULT(no) +#fi + +CFLAGS="$KCFLAGS" +CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS " + +AC_SUBST(MOD_LINK) +AC_SUBST(LINUX25) \ No newline at end of file diff --git a/lnet/autogen.sh b/lnet/autogen.sh new file mode 100644 index 0000000..9deed73 --- /dev/null +++ b/lnet/autogen.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +aclocal && +automake --add-missing && +${AUTOCONF:-autoconf} diff --git a/lnet/build.m4 b/lnet/build.m4 new file mode 100644 index 0000000..4e8dbbb --- /dev/null +++ b/lnet/build.m4 @@ -0,0 +1,108 @@ + +# ---------- directories --------- + + +# --------- unsigned long long sane? ------- + +AC_CHECK_SIZEOF(unsigned long long, 0) +echo "---> size SIZEOF $SIZEOF_unsigned_long_long" +echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" +if test $ac_cv_sizeof_unsigned_long_long != 8 ; then + AC_MSG_ERROR([** we assume that sizeof(long long) == 8. Tell phil@clusterfs.com]) +fi + +# directories for binaries +ac_default_prefix= +bindir='${exec_prefix}/usr/bin' +sbindir='${exec_prefix}/usr/sbin' +includedir='${prefix}/usr/include' + +# Directories for documentation and demos. 
+docdir='${prefix}/usr/share/doc/$(PACKAGE)' +AC_SUBST(docdir) +demodir='$(docdir)/demo' +AC_SUBST(demodir) +pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples' +AC_SUBST(pkgexampledir) +pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre' +AC_SUBST(pymoddir) +modulenetdir='$(moduledir)/net/$(PACKAGE)' +AC_SUBST(modulenetdir) + + +# ---------- BAD gcc? ------------ +AC_PROG_RANLIB +AC_PROG_CC +AC_MSG_CHECKING(for buggy compiler) +CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` +bad_cc() { + echo + echo " '$CC_VERSION'" + echo " has been known to generate bad code, " + echo " please get an updated compiler." + AC_MSG_ERROR(sorry) +} +TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` +if test "$TMP_VERSION" = "gcc version 2.95"; then + bad_cc +fi +case "$CC_VERSION" in + # ost_pack_niobuf putting 64bit NTOH temporaries on the stack + # without "sub $0xc,%esp" to protect the stack from being + # stomped on by interrupts (bug 606) + "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") + bad_cc + ;; + # mandrake's similar sub 0xc compiler bug + # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2 + "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") + bad_cc + ;; + *) + AC_MSG_RESULT(no known problems) + ;; +esac +# end ------ BAD gcc? 
------------ + +# -------- Check for required packages -------------- + +# this doesn't seem to work on older autoconf +# AC_CHECK_LIB(readline, readline,,) +AC_ARG_ENABLE(readline, [ --enable-readline use readline library],, + enable_readline="yes") + +if test "$enable_readline" = "yes" ; then + LIBREADLINE="-lreadline -lncurses" + HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1" +else + LIBREADLINE="" + HAVE_LIBREADLINE="" +fi +AC_SUBST(LIBREADLINE) +AC_SUBST(HAVE_LIBREADLINE) + +AC_ARG_ENABLE(efence, [ --enable-efence use efence library],, + enable_efence="no") + +if test "$enable_efence" = "yes" ; then + LIBEFENCE="-lefence" + HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1" +else + LIBEFENCE="" + HAVE_LIBEFENCE="" +fi +AC_SUBST(LIBEFENCE) +AC_SUBST(HAVE_LIBEFENCE) + +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) +AC_MSG_CHECKING(if you are building lib lustre) +if test "$host_cpu" = "lib"; then + AC_MSG_RESULT(yes) + libdir='${exec_prefix}/lib/lustre' +else + AC_MSG_RESULT(no) +fi + +# end -------- Kernel build environment. ----------------- + + diff --git a/lnet/configure.in b/lnet/configure.in new file mode 100644 index 0000000..7c32246 --- /dev/null +++ b/lnet/configure.in @@ -0,0 +1,38 @@ +# This version is here to make autoconf happy; the name is a file which is +# "unique" to this directory so that configure knows where it should run. +AC_INIT(knals/Makefile.am, 3.0) +AC_CANONICAL_SYSTEM +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +# Automake variables. Steal the version number from packaging/intersync.spec +AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c])) +# AM_MAINTAINER_MODE + +sinclude(archdep.m4) +sinclude(build.m4) +sincludemalloc() callback does not need to be called +for each object. 
+ +The objects are maintained on a per-object type singly linked free +list and contain a pointer to the next free object. This pointer +is NULL if the object is not on the free list and is non-zero +if it is on the list. The special sentinel value of 0xDEADBEEF +is used to mark the end of the free list since NULL could +indicate that the last object in the list is not free. + +When one of the lib_*_alloc() functions is called, the library +returns the head of the free list and advances the head pointer +to the next item on the list. The special case of 0xDEADBEEF is +checked and a NULL pointer is returned if there are no more +objects of this type available. The lib_*_free() functions +are even simpler -- check to ensure that the object is not already +free, set its next pointer to the current head and then set +the head to be this newly freed object. + +Since C does not have templates, I did the next best thing and wrote +the memory pool allocation code as a macro that expands based on the +type of the argument. The mk_alloc(T) macro expands to +write the _lib_T_alloc() and lib_T_free() functions. +It requires that the object have a pointer of the type T named +"next_free". There are also functions that map _lib_T_alloc() +to lib_T_alloc() so that the library can add some extra +functionality to the T constructor. + + + +LINKED LISTS: +------------ + +Many of the active Portals objects are stored in doubly linked lists +when they are active. These are always implemented with the pointer +to the next object and a pointer to the next pointer of the +previous object. This avoids the "dummy head" object or +special cases for inserting at the beginning or end of the list. +The pointer manipulations are a little hairy at times, but +I hope that they are understandable.
+ +The actual linked list code is implemented as macros in , +although the object has to know about + + diff --git a/lnet/doc/Makefile.am b/lnet/doc/Makefile.am new file mode 100644 index 0000000..7c65e6c --- /dev/null +++ b/lnet/doc/Makefile.am @@ -0,0 +1,46 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +LYX2PDF = lyx --export pdf +LYX2TXT = lyx --export text +LYX2HTML = lyx --export html +SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps + +DOCS = portals3.pdf +IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps +LYXFILES= portals3.lyx + +MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(GENERATED) +GENERATED = +EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) + +all: $(DOCS) + +# update date and version in document +date := $(shell date +%x) +tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') +addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g' + +# Regenerate when the $(VERSION) or $Name: $ changes. 
+.INTERMEDIATE: $(GENERATED) +$(GENERATED) : %.lyx: %.lin Makefile + $(addversion) $< > $@ + +.lyx.pdf: + @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" + +.lyx.txt: + @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" +.lyx.html: + @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" +.fig.eps: + -fig2dev -L eps $< > $@ + +portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx + +syncweb: portals3.pdf +# cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf +# ( cd /usr/src/www ; make lustre ; make synclustre ) + diff --git a/lnet/doc/Message-life-cycle b/lnet/doc/Message-life-cycle new file mode 100644 index 0000000..e8cc7e2 --- /dev/null +++ b/lnet/doc/Message-life-cycle @@ -0,0 +1,118 @@ +This documents the life cycle of a message as it arrives and is handled by +a basic async, packetized NAL. There are four types of messages that have +slightly different life cycles, so they are addressed independently. + + +Put request +----------- + +1. NAL notices that there is an incoming message header on the network +and reads a ptl_hdr_t in from the wire. + +2. It may store additional NAL specific data that provides context +for this event in a void* that it will interpret in some fashion +later. + +3. The NAL calls lib_parse() with a pointer to the header and its +private data structure. + +4. The library decodes the header and may build a message state +object that describes the event to be written and the ACK to be +sent, if any. It then calls nal->recv() with the private data +that the NAL passed in, a pointer to the message state object +and a translated user address. + + The NAL will have been given a chance to pretranslate + all user addresses when the buffers are created. This + process is described in the NAL-HOWTO. + +5.
The NAL should restore whatever context it required from the +private data pointer, begin receiving the bytes and possibly store +some extra state of its own. It should return at this point. + + + +Get request +----------- + +1. As with a Put, the NAL notices the incoming message header and +passes it to lib_parse(). + +2. The library decodes the header and calls nal->recv() with a +zero byte length, offset and destination to instruct it to clean +up the wire after reading the header. The private data will +be passed in as well, allowing the NAL to retrieve any state +or context that it requires. + +3. The library may build a message state object to possibly +write an event log or invalidate a memory region. + +4. The library will build a ptl_msg_t header that specifies the +Portals protocol information for delivery at the remote end. + +5. The library calls nal->send() with the pre-built header, +the optional message state object, the four part address +component, a translated user pointer + offset, and some +other things. + +6. The NAL is to put the header on the wire or copy it at +this point (since it is off the stack). It should store some +amount of state about its current position in the message and +the destination address. + +7. And then return to the library. + + +Reply request +------------- + +1. Starting at "The library decodes the header..." + +2. The library decodes the header and calls nal->recv() +to bring in the rest of the message. Flow continues in +exactly the same fashion as with all other receives. + + +Ack request +----------- + +1. The library decodes the header, builds the appropriate data +structures for the event in a message state object and calls nal->recv() +with a zero byte length, etc. + + +Packet arrival +-------------- + +1.
The NAL should notice the arrival of a packet, retrieve whatever +state it needs from the message ID or other NAL specific header data +and place the data bytes directly into the user address that was +given to nal->recv(). + + How this happens is outside the scope of the Portals library + and solely determined by the NAL... + +2. If this is the last packet in a message, the NAL should retrieve +the lib_msg_t *cookie that it was given in the call to nal->recv() +and pass it to lib_finalize(). lib_finalize() may call nal->send() +to send an ACK, nal->write() to record an entry in the event log, +nal->invalidate() to unregister a region of memory or do nothing at all. + +3. It should then clean up any remaining NAL specific state about +the message and go back into the main loop. + + +Outgoing packets +---------------- + +1. When the NAL has pending output, it should put the packets on +the wire wrapped with whatever implementation specified wrappers. + +2. Once it has output all the packets of a message it should +call lib_finalize() with the message state object that was +handed to nal->send(). This will allow the library to clean +up its state regarding the message and write any pending event +entries. + + + diff --git a/lnet/doc/NAL-HOWTO b/lnet/doc/NAL-HOWTO new file mode 100644 index 0000000..ea38aed --- /dev/null +++ b/lnet/doc/NAL-HOWTO @@ -0,0 +1,293 @@ +This document is a first attempt at describing how to write a NAL +for the Portals 3 library. It also defines the library architecture +and the abstraction of protection domains. + + +First, an overview of the architecture: + + Application + +----|----+-------- + | + API === NAL (User space) + | +---------+---|----- + | + LIB === NAL (Library space) + | +---------+---|----- + + Physical wire (NIC space) + + +Application + API +API-side NAL +------------ +LIB-side NAL + LIB +LIB-side NAL + wire + +Communication is through the indicated paths via well defined +interfaces.
The API and LIB portions are written to be portable +across platforms and do not depend on the network interface. + +Communication between the application and the API code is +defined in the Portals 3 API specification. This is the +user-visible portion of the interface and should be the most +stable. + + + +API-side NAL: +------------ + +The user space NAL needs to implement only a few functions +that are stored in a nal_t data structure and called by the +API-side library: + + int forward( nal_t *nal, + int index, + void *args, + size_t arg_len, + void *ret, + size_t ret_len + ); + +Most of the data structures in the portals library are held in +the LIB section of the code, so it is necessary to forward API +calls across the protection domain to the library. This is +handled by the NAL's forward method. Once the argument and return +blocks are on the remote side the NAL should call lib_dispatch() +to invoke the appropriate API function. + + int validate( nal_t *nal, + void *base, + size_t extent, + void **trans_base, + void **trans_data + ); + +The validate method provides a means for the NAL to prevalidate +and possibly pretranslate user addresses into a form suitable +for fast use by the network card or kernel module. The trans_base +pointer will be used by the library every time it needs to +refer to the block of memory. The trans_data result is a +cookie that will be handed to the NAL along with the trans_base. + +The library never performs calculations on the trans_base value; +it only computes offsets that are then handed to the NAL. + + + int shutdown( nal_t *nal, int interface ); + +Brings down the network interface. The remote NAL side should +call lib_fini() to bring down the library side of the network. + + void yield( nal_t *nal ); + +This allows the user application to gracefully give up the processor +while busy waiting.
Performance critical applications may not +want to take the time to call this function, so it should be an +option to the PtlEQWait call. Right now it is not implemented as such. + +Lastly, the NAL must implement a function named PTL_IFACE_*, where +* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR. +This initialization function is to set up communication with the +library-side NAL, which should call lib_init() to bring up the +network interface. + + + +LIB-side NAL: +------------ + +On the library-side, the NAL has much more responsibility. It +is responsible for calling lib_dispatch() on behalf of the user, +it is also responsible for bringing packets off the wire and +pushing bits out. As on the user side, the methods are stored +in a nal_cb_t structure that is defined on a per network +interface basis. + +The calls to lib_dispatch() need to be examined. The prototype: + + void lib_dispatch( + nal_cb_t *nal, + void *private, + int index, + void *arg_block, + void *ret_block + ); + +has two complications. The private field is a NAL-specific +value that will be passed to any callbacks produced as a result +of this API call. Kernel module implementations may use this +for task structures, or perhaps network card data. It is ignored +by the library. + +Secondly, the arg_block and ret_block must be in the same protection +domain as the library. The NAL's two halves must communicate the +sizes and perform the copies. After the call, the buffer pointed +to by ret_block will be filled in and should be copied back to +the user space. How this is to be done is NAL specific. + + int lib_parse( + nal_cb_t *nal, + ptl_hdr_t *hdr, + void *private + ); + +This is the only other entry point into the library from the NAL. +When the NAL detects an incoming message on the wire it should read +sizeof(ptl_hdr_t) bytes and pass a pointer to the header to +lib_parse(). 
It may set private to be anything that it needs to +tie the incoming message to callbacks that are made as a result +of this event. + +The method calls are: + + int (*send)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int nid, + int pid, + int gid, + int rid, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t len + ); + +This is a tricky function -- it must support async output +of messages as well as properly synchronized event log writing. +The private field is the same that was passed into lib_dispatch() +or lib_parse() and may be used to tie this call to the event +that initiated the entry to the library. + +The cookie is a pointer to a library private value that must +be passed to lib_finalize() once the message has been completely +sent. It should not be examined by the NAL for any meaning. + +The four ID fields are passed in, although some implementations +may not use all of them. + +The single base pointer has been replaced with the translated +address that the API NAL generated in the api_nal->validate() +call. The trans_data is unchanged and the offset is in bytes. + + + int (*recv)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t mlen, + size_t rlen + ); + +This callback will only be called in response to lib_parse(). +The cookie, trans_base and trans_data are as discussed in send(). +The NAL should read mlen bytes from the wire, deposit them into +trans_base + offset and then discard (rlen - mlen) bytes. +Once the entire message has been received the NAL should call +lib_finalize() with the lib_msg_t *cookie. + +The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0 +are used to indicate that the NAL should clean up the wire. This could +be implemented as a blocking call, although having it return as quickly +as possible is desirable.
+ + int (*write)( + nal_cb_t *nal, + void *private, + user_ptr trans_addr, + user_ptr trans_data, + size_t offset, + + void *src_addr, + size_t len + ); + +This is essentially a cross-protection domain memcpy(). The user address +has been pretranslated by the api_nal->validate() call. + + void *(*malloc)( + nal_cb_t *nal, + size_t len + ); + + void (*free)( + nal_cb_t *nal, + void *buf + ); + +Since the NAL may be in a non-standard hosted environment it can +not call malloc(). This allows the library side NAL to implement +the system specific malloc(). In the current reference implementation +the library only calls nal->malloc() when the network interface is +initialized and then calls free when it is brought down. The library +maintains its own pool of objects for allocation so only one call to +malloc is made per object type. + + void (*invalidate)( + nal_cb_t *nal, + user_ptr trans_base, + user_ptr trans_data, + size_t extent + ); + +User addresses are validated/translated at the user-level API NAL +method, which is likely to push them to this level. Meanwhile, +the library NAL will be notified when the library no longer +needs the buffer. Overlapped buffers are not detected by the +library, so the NAL should ref count each page involved. + +Unfortunately we have a few bugs when the invalidate method is +called. It is still in progress... + + void (*printf)( + nal_cb_t *nal, + const char *fmt, + ... + ); + +As with malloc(), the library does not have any way to do printf +or printk. It is not necessary for the NAL to implement this +call, although leaving it out will make debugging difficult. + + void (*cli)( + nal_cb_t *nal, + unsigned long *flags + ); + + void (*sti)( + nal_cb_t *nal, + unsigned long *flags + ); + +These are used by the library to mark critical sections.
+ + int (*gidrid2nidpid)( + nal_cb_t *nal, + ptl_id_t gid, + ptl_id_t rid, + ptl_id_t *nid, + ptl_id_t *pid + ); + + + int (*nidpid2gidrid)( + nal_cb_t *nal, + ptl_id_t nid, + ptl_id_t pid, + ptl_id_t *gid, + ptl_id_t *rid + ); + +Rolf added these. I haven't looked at how they have to work yet. diff --git a/lnet/doc/file.fig b/lnet/doc/file.fig new file mode 100644 index 0000000..914c294 --- /dev/null +++ b/lnet/doc/file.fig @@ -0,0 +1,111 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1200 750 1650 1050 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1050 1650 750 1200 750 1200 1050 1650 1050 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001 +-6 +6 1200 2325 1650 2625 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2625 1650 2325 1200 2325 1200 2625 1650 2625 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001 +-6 +6 1200 1800 1650 2100 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2100 1650 1800 1200 1800 1200 2100 1650 2100 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001 +-6 +6 1200 1275 1650 1575 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1575 1650 1275 1200 1275 1200 1575 1650 1575 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001 +-6 +6 450 750 900 1200 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 825 450 1050 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1050 900 825 +-6 +6 450 2325 900 2775 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 2400 450 2625 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2625 900 2400 +-6 +6 450 1800 900 2250 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 
0 0 2 + 450 1875 450 2100 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2100 900 1875 +-6 +6 450 1275 900 1725 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1350 450 1575 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1575 900 1350 +-6 +6 2250 750 3450 2625 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1200 3150 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1500 3150 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1800 3150 1800 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2100 3150 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 975 3150 975 3150 2625 2550 2625 2550 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2400 3150 2400 +4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2400 2550 1350 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1875 2550 1050 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1425 2550 1950 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 900 2550 1650 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 900 1200 900 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1425 1200 1425 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1950 1200 1950 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2475 1200 2475 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2025 2550 2250 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2550 2550 2475 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1875 2850 1875 600 225 600 225 2850 1875 2850 +4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001 diff --git a/lnet/doc/flow_new.fig 
b/lnet/doc/flow_new.fig new file mode 100644 index 0000000..d828dea --- /dev/null +++ b/lnet/doc/flow_new.fig @@ -0,0 +1,213 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 525 2175 1575 2925 +6 675 2287 1425 2812 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001 +4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 2550 1050 2175 525 2550 1050 2925 1575 2550 +-6 +6 3450 1275 4350 1725 +6 3600 1312 4200 1687 +4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001 +4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3450 1275 4350 1275 4350 1725 3450 1725 3450 1275 +-6 +6 4650 1275 5550 1725 +6 4725 1312 5475 1687 +4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001 +4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4650 1275 5550 1275 5550 1725 4650 1725 4650 1275 +-6 +6 1350 525 2250 975 +6 1350 562 2250 937 +4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001 +4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 525 2250 525 2250 975 1350 975 1350 525 +-6 +6 525 1125 1575 1875 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 1500 1050 1125 525 1500 1050 1875 1575 1500 +4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001 +-6 +6 2340 1237 2940 1687 +6 2340 1237 2940 1687 +4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001 +4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001 +4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001 +-6 +-6 +6 525 3225 1575 3975 +6 675 3375 1425 3750 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 525 3600 1050 3225 1575 3600 1050 3975 525 3600 +-6 +6 3300 3375 4350 3825 +6 3300 3412 4350 3787 +4 1 0 50 0 0 
10 0.0000 4 105 735 3825 3562 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3300 3375 4350 3375 4350 3825 3300 3825 3300 3375 +-6 +6 1950 3225 3000 3975 +6 2250 3450 2700 3750 +4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 3000 3600 2475 3225 1950 3600 2475 3975 3000 3600 +-6 +6 3150 4500 4200 4950 +6 3150 4537 4200 4912 +4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3150 4500 4200 4500 4200 4950 3150 4950 3150 4500 +-6 +6 600 4500 1500 4950 +6 675 4537 1425 4912 +4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001 +4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 600 4500 1500 4500 1500 4950 600 4950 600 4500 +-6 +6 4650 4350 5700 5100 +6 4950 4537 5400 4912 +6 4950 4537 5400 4912 +4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001 +4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001 +-6 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 5700 4725 5175 4350 4650 4725 5175 5100 5700 4725 +-6 +6 6000 4500 6900 4950 +6 6225 4575 6675 4875 +4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001 +4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 6000 4500 6900 4500 6900 4950 6000 4950 6000 4500 +-6 +6 1800 4350 2850 5100 +6 2100 4575 2550 4875 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 2850 4725 2325 4350 1800 4725 2325 5100 2850 4725 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 1875 1050 2175 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 1500 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 
0 1.00 60.00 120.00 + 1050 450 1050 1125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1350 750 1050 750 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 2925 1050 3225 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3150 1500 3450 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4350 1500 4650 1500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2100 1500 2625 1125 3150 1500 2625 1875 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 3600 1950 3600 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 3975 1050 4500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 3600 3300 3600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 4725 1800 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 5700 4725 6000 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2850 4725 3150 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4200 4725 4650 4725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 6900 4725 7950 4725 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1575 2550 1650 2550 1800 2550 1800 2400 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 2250 750 2475 750 2625 750 2625 900 2625 1125 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 7500 4725 7500 1650 7500 1500 7350 1500 5550 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 2475 3225 2475 2400 2475 2250 2325 2250 1800 2250 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 3825 3375 3825 2175 3825 2025 3675 2025 1800 2025 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 
4125 + 4425 4275 4425 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125 + 7275 4275 7275 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001 +4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001 diff --git a/lnet/doc/get.fig b/lnet/doc/get.fig new file mode 100644 index 0000000..28db949 --- /dev/null +++ b/lnet/doc/get.fig @@ -0,0 +1,33 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 2775 900 3525 1200 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001 +4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001 +-6 +6 1350 1725 2175 2025 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 750 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 825 2700 1275 +2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1350 900 1950 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 0 
300 1200 300 1200 2250 0 2250 0 300 +4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 diff --git a/lnet/doc/ieee.bst b/lnet/doc/ieee.bst new file mode 100644 index 0000000..5367caa --- /dev/null +++ b/lnet/doc/ieee.bst @@ -0,0 +1,1114 @@ +% --------------------------------------------------------------- +% +% $Id: ieee.bst,v 1.1.2.1 2003/05/19 04:25:30 braam Exp $ +% +% by Paolo.Ienne@di.epfl.ch +% +% --------------------------------------------------------------- +% +% no guarantee is given that the format corresponds perfectly to +% IEEE 8.5" x 11" Proceedings, but most features should be ok. +% +% --------------------------------------------------------------- +% +% `ieee' from BibTeX standard bibliography style `abbrv' +% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. +% Copyright (C) 1985, all rights reserved. +% Copying of this file is authorized only if either +% (1) you make absolutely no changes to your copy, including name, or +% (2) if you do make changes, you name it something other than +% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. +% This restriction helps ensure that all standard styles are identical. +% The file btxbst.doc has the documentation for this style. 
+ +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ 
+ { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ } + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number 
either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "In " booktitle emphasize * } + { "In " format.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + key empty$ not and + { 
"all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref 
missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { 
format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ 
output.bibitem + format.authors output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} 
{"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ 
+ } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * + "}\setlength{\itemsep}{-1ex}\small" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} + +% end of file ieee.bst +% 
--------------------------------------------------------------- diff --git a/lnet/doc/mpi.fig b/lnet/doc/mpi.fig new file mode 100644 index 0000000..e1a91b5 --- /dev/null +++ b/lnet/doc/mpi.fig @@ -0,0 +1,117 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 150 1650 900 2025 +4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001 +4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001 +-6 +6 150 150 900 525 +4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001 +4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001 +-6 +6 2550 4125 3150 4725 +4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001 +4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001 +4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001 +-6 +6 1050 1575 1950 1875 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 1575 1950 1575 1950 1875 1050 1875 1050 1575 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001 +-6 +6 5400 1575 6300 2175 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 1575 6300 1575 6300 2175 5400 2175 5400 1575 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001 +-6 +6 5400 2400 6300 3000 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 2400 6300 2400 6300 3000 5400 3000 5400 2400 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001 +-6 +6 1050 2400 1950 2700 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 2400 1950 2400 1950 2700 1050 2700 1050 2400 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001 +-6 +6 1050 825 1950 1125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 825 1950 825 1950 1125 1050 1125 1050 825 +4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1575 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2025 4050 3375 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 675 6600 675 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 1350 6600 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 4125 
3300 4125 3300 4725 2400 4725 2400 4125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 4500 4050 3675 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 1725 5400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2550 5400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2850 4050 3450 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1800 1500 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 825 3300 825 3300 1275 2400 1275 2400 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 2625 1500 4125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 4125 1950 4125 1950 4425 1050 4425 1050 4125 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 300 1500 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 975 2400 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 1725 2400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 2550 2400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 4275 2400 4275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 1575 3300 1575 3300 2175 2400 2175 2400 1575 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 2400 3300 2400 3300 3000 2400 3000 2400 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4050 3300 5250 3300 5250 3750 4050 3750 4050 3300 +4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001 +4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001 +4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001 +4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001 +4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001 +4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001 
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001 +4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001 diff --git a/lnet/doc/portals.fig b/lnet/doc/portals.fig new file mode 100644 index 0000000..9b1271b --- /dev/null +++ b/lnet/doc/portals.fig @@ -0,0 +1,68 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 900 1650 900 1650 1200 1350 1200 1350 900 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1800 1350 2100 1350 2100 1650 1800 1650 1800 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2250 1800 2550 1800 2550 2100 2250 2100 2250 1800 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 4200 375 4200 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 525 600 1125 600 1125 2100 525 2100 525 600 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4425 1275 4875 1275 4875 1950 4425 1950 4425 1275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 1200 3150 1200 3150 1500 2550 1500 2550 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 1425 4425 1425 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3600 825 3750 825 3750 1125 3600 1125 3600 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2025 1425 2550 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 4425 750 4875 750 4875 1125 4425 1125 4425 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3675 975 4425 975 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2 + 0 0 1.00 60.00 120.00 + 825 1050 1350 1050 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1350 1500 1500 1650 1500 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1950 1575 1950 1800 1950 1950 2100 1950 2250 1950 + 0.000 1.000 
1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 975 1125 975 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 1125 1125 1125 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7 + 0 0 1.00 60.00 120.00 + 3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975 + 3600 975 + 0.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001 +4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001 +4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001 +4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001 +4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001 +4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001 +4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001 diff --git a/lnet/doc/portals3.bib b/lnet/doc/portals3.bib new file mode 100644 index 0000000..323b99f --- /dev/null +++ b/lnet/doc/portals3.bib @@ -0,0 +1,124 @@ +@Article{ Cplant, + title = { {M}assively {P}arallel {C}omputing with + {C}ommodity {C}omponents }, + author = { Ron Brightwell and David S. Greenberg and Arthur + B. 
Maccabe and Rolf Riesen }, + journal = { Parallel Computing }, + volume = { 26 }, + month = { February }, + pages = { 243-266 }, + year = { 2000 } +} + +@Manual{ Portals, + organization = { Sandia National Laboratories }, + title = { {P}uma {P}ortals }, + note = { http://www.cs.sandia.gov/puma/portals }, + year = { 1997 } +} + +@Techreport{ VIA, + title = { {V}irtual {I}nterface {A}rchitecture + {S}pecification {V}ersion 1.0 }, + author = { {Compaq, Microsoft, and Intel} }, + institution = { Compaq, Microsoft, and Intel }, + month = { December }, + year = { 1997 } +} + +@Techreport{ ST, + title = { {I}nformation {T}echnology - {S}cheduled + {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 }, + author = { {Task Group of Technical Committee T11} }, + institution = { Accredited Standards Committee NCITS }, + month = { July }, + year = { 1998 } +} + +@Manual{ TFLOPS, + organization = { Sandia National Laboratories }, + title = { ASCI Red }, + note = { http://www.sandia.gov/ASCI/TFLOP }, + year = { 1996 } +} + +@Techreport{ GM, + title = { The {GM} {M}essage {P}assing {S}ystem }, + author = { {Myricom, Inc.} }, + institution = { {Myricom, Inc.} }, + year = { 1997 }, +} + +@Article{ MPIstandard, + title = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard }, + author = { {Message Passing Interface Forum} }, + journal = { The International Journal of Supercomputer Applications + and High Performance Computing }, + volume = { 8 }, + year = { 1994 } +} + +@Inproceedings{ PumaOS, + author = "Lance Shuler and Chu Jong and Rolf Riesen and + David van Dresser and Arthur B. Maccabe and + Lee Ann Fisk and T. Mack Stallcup", + booktitle = "Proceeding of the 1995 Intel Supercomputer + User's Group Conference", + title = "The {P}uma Operating System for Massively Parallel Computers", + organization = "Intel Supercomputer User's Group", + year = 1995 +} + +@InProceedings{ SUNMOS, +author = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and + Stephen R. 
Wheat", +title = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide", +booktitle = "Proceedings of the {Intel} Supercomputer Users' Group. 1994 + Annual North America Users' Conference.", +year = 1994, +pages = "245--251", +month = "June", +location = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps" +} + +@InProceedings { PumaMPI, + title = { Design and Implementation of {MPI} on {P}uma Portals }, + author = { Ron Brightwell and Lance Shuler }, + booktitle = { Proceedings of the Second MPI Developer's Conference }, + pages = { 18-25 }, + month = { July }, + year = { 1996 } +} + +@Inproceedings{ FM2, + author = { Mario Lauria and Scott Pakin and Andrew Chien }, + title = { {E}fficient {L}ayering for {H}igh {S}peed + {C}ommunication: {F}ast {M}essages 2.x }, + Booktitle = { Proceedings of the IEEE International Symposium + on High Performance Distributed Computing }, + year = { 1998 } +} + +@Manual { CraySHMEM, + title = "SHMEM Technical Note for C, SG-2516 2.3", + organization = "Cray Research, Inc.", + month = "October", + year = 1994 +} + +@Manual { MPI2, + title = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface", + organization = "Message Passing Interface Forum", + note = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html", + month = "July", + year = 1997 +} + +@InProceedings { PMMPI, + title = { {The Design and Implementation of Zero Copy MPI Using + Commodity Hardware with a High Performance Network} }, + author = { Francis O'Carroll and Hiroshi Tezuka and Atsushi Hori + and Yutaka Ishikawa }, + booktitle = { Proceedings of the ICS }, + year = { 1998 } +} diff --git a/lnet/doc/portals3.lyx b/lnet/doc/portals3.lyx new file mode 100644 index 0000000..f3c24e0 --- /dev/null +++ b/lnet/doc/portals3.lyx @@ -0,0 +1,15946 @@ +#LyX 1.2 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 220 +\textclass report +\begin_preamble +\usepackage{fullpage} +\renewenvironment{comment}% +{\begin{quote}\textbf{Discussion}: \slshape}% +{\end{quote}} +\pagestyle{myheadings} +\markboth{$Revision: 1.1.2.1 $\hfil$Date: 2003/05/19 04:25:30 $}% +{$Date: 2003/05/19 04:25:30 $\hfil$Revision: 1.1.2.1 $} +\end_preamble +\language american +\inputencoding auto +\fontscheme pslatex +\graphics default +\paperfontsize 10 +\spacing single +\papersize letterpaper +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 2 +\tocdepth 2 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 2 +\paperpagestyle headings + +\layout Title + +The Portals 3.2 Message Passing Interface +\newline + Revision 1.1 +\layout Author + +Ron Brightwell +\begin_inset Foot +collapsed true + +\layout Standard + +R. + Brightwell and R. + Riesen are with the Scalable Computing Systems Department, Sandia National + Laboratories, P.O. + Box 5800, Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov. +\end_inset + +, Arthur B. + Maccabe +\begin_inset Foot +collapsed true + +\layout Standard + +A. + B. + Maccabe is with the Computer Science Department, University of New Mexico, + Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87131-1386, maccabe@cs.unm.edu. +\end_inset + +, Rolf Riesen and Trammell Hudson +\layout Abstract + +This report presents a specification for the Portals 3.2 message passing + interface. + Portals 3.2 is intended to allow scalable, high-performance network communicatio +n between nodes of a parallel computing system. + Specifically, it is designed to support a parallel computing platform composed + of clusters of commodity workstations connected by a commodity system area + network fabric. 
+ In addition, Portals 3.2 is well suited to massively parallel processing + and embedded systems. + Portals 3.2 represents an adaption of the data movement layer developed + for massively parallel processing platforms, such as the 4500-node Intel + TeraFLOPS machine. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +clearpage +\backslash +pagenumbering{roman} +\backslash +setcounter{page}{3} +\end_inset + + +\layout Standard + + +\begin_inset LatexCommand \tableofcontents{} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList figure + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList table + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Chapter* + +Summary of Changes for Revision 1.1 +\layout Enumerate + +Updated version number to 3.2 throughout the document +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sub:PtlGetId} + +\end_inset + +: added +\family typewriter +PTL_SEGV +\family default + to error list for +\shape italic +PtlGetId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: added +\family typewriter +PTL_ML_TOOLONG +\family default + to error list for +\shape italic +PtlMEAttach +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meunlink} + +\end_inset + +: removed text referring to a list of associated memory descriptors. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added text to describe unlinking a free-floating memory descriptor. 
+\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added entry for +\family typewriter +ptl_seq_t +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added definition of +\family typewriter +max_offset +\family default +. +\layout Enumerate + +added text to clarify +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default +. +\end_deeper +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: modified text for +\family typewriter +unlink_op +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: added text to clarify multiple calls to +\shape italic +PtlNIInit +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: added text to clarify +\family typewriter +unlink_nofit +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:receiving} + +\end_inset + +: removed text indicating that an MD will reject a message if the associated + EQ is full. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + error code and text to indicate that only MDs with no pending operations + can be unlinked. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + return code. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added user id field, MD handle field, and NI specific failure field to + the +\family typewriter +ptl_event_t +\family default + structure. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added +\family typewriter +ptl_ni_fail_t +\family default +. 
+\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added +\family typewriter +PTL_EVENT_UNLINK +\family default + event type. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: removed +\shape slanted +PtlTransId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +: listed allowable constants with relevant fields. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: added +\shape italic +PtlMEAttachAny +\shape default + function. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_PT_FULL +\family default + return code for +\shape italic +PtlMEAttachAny +\shape default +. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + +: updated to reflect new event types. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: added +\family typewriter +ptl_nid_t +\family default +, +\family typewriter +ptl_pid_t +\family default +, and +\family typewriter +ptl_uid_t +\family default +. +\layout Chapter* + +Summary of Changes for Version 3.1 +\layout Section* + +Thread Issues +\layout Standard + +The most significant change to the interface from version 3.0 to 3.1 involves + the clarification of how the interface interacts with multi-threaded applicatio +ns. + We adopted a generic thread model in which processes define an address + space and threads share the address space. 
+ Consideration of the API in the light of threads led to several clarifications + throughout the document: +\layout Enumerate + +Glossary: +\begin_deeper +\layout Enumerate + +added a definition for +\emph on +thread +\emph default +, +\layout Enumerate + +reworded the definition for +\emph on +process +\emph default +. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +: added section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:threads} + +\end_inset + + to describe the multi-threading model used by the Portals API. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlinit} + +\end_inset + +: +\emph on +PtlInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlfini} + +\end_inset + +: +\emph on +PtlFini +\emph default + should be called once as the process is terminating and not as each thread + terminates. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +: Portals does not define thread ids. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + +: network interfaces are associated with processes, not threads. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: +\emph on +PtlNIInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqget} + +\end_inset + +: +\emph on +PtlEQGet +\emph default + returns +\family typewriter +PTL_EQ_EMPTY +\family default + if a thread is blocked on +\emph on +PtlEQWait +\emph default +.
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqwait} + +\end_inset + +: waiting threads are awakened in FIFO order. + +\layout Standard + +Two functions, +\emph on +PtlNIBarrier +\emph default + and +\emph on +PtlEQCount +\emph default + were removed from the API. + +\emph on +PtlNIBarrier +\emph default + was defined to block the calling process until all of the processes in + the application group had invoked +\emph on +PtlNIBarrier +\emph default +. + We now consider this functionality, along with the concept of groups (see + the discussion under +\begin_inset Quotes eld +\end_inset + +other changes +\begin_inset Quotes erd +\end_inset + +), to be part of the runtime system, not part of the Portals API. + +\emph on +PtlEQCount +\emph default + was defined to return the number of events in an event queue. + Because external operations may lead to new events being added and other + threads may remove events, the value returned by +\emph on +PtlEQCount +\emph default + would have to be a hint about the number of events in the event queue. +\layout Section* + +Handling small, unexpected messages +\layout Standard + +Another set of changes relates to handling small unexpected messages in + MPI. + In designing version 3.0, we assumed that each unexpected message would + be placed in a unique memory descriptor. + To avoid the need to process a long list of memory descriptors, we moved + the memory descriptors out of the match list and hung them off of a single + match list entry. + In this way, large unexpected messages would only encounter a single +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + match list entry before encountering the +\begin_inset Quotes eld +\end_inset + +long message +\begin_inset Quotes erd +\end_inset + + match list entry. + Experience with this strategy identified resource management problems with + this approach. 
+ In particular, a long sequence of very short (or zero length) messages + could quickly exhaust the memory descriptors constructed for handling unexpecte +d messages. + Our new strategy involves the use of several very large memory descriptors + for small unexpected messages. + Consecutive unexpected messages will be written into the first of these + memory descriptors until the memory descriptor fills up. + When the first of the +\begin_inset Quotes eld +\end_inset + +small memory +\begin_inset Quotes erd +\end_inset + + descriptors fills up, it will be unlinked and subsequent short messages + will be written into the next +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor. + In this case, a +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor will be declared full when it does not have sufficient + space for the largest small unexpected message. +\layout Standard + +This led to two significant changes. + First, each match list entry now has a single memory descriptor rather + than a list of memory descriptors. + Second, in addition to exceeding the operation threshold, a memory descriptor + can be unlinked when the local offset exceeds a specified value. + These changes have led to several changes in this document: +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{subsec:paddress} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed references to the memory descriptor list, +\layout Enumerate + +changed the portals address translation description to indicate that unlinking + a memory descriptor implies unlinking the associated match list entry--match + list entries can no longer be unlinked independently from the memory descriptor.
+ +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed unlink from argument list, +\layout Enumerate + +removed description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +changed wording of the error condition when the Portal table index already + has an associated match list. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +: removed unlink from argument list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: added +\family typewriter +max_offset +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +removed reference to memory descriptor lists, +\layout Enumerate + +changed wording of the error condition when match list entry already has + an associated memory descriptor, +\layout Enumerate + +changed the description of the +\family typewriter +unlink +\family default + argument. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +: removed +\family typewriter +PtlMDInsert +\family default + operation. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: removed references to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: removed references to PtlMDInsert. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + +: revised the MPI example to reflect the changes to the interface. + +\layout Standard + +Several changes have been made to improve the general documentation of the + interface. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_EQ_NONE +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_ID_ANY +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: documented the return value +\family typewriter +PTL_INV_EQ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + +: clarified the description of the +\emph on +PtlMDUpdate +\emph default + function. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:implvals} + +\end_inset + +: introduced a new section to document the implementation defined values. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: modified Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + to indicate where each constant is introduced and where it is used. + +\layout Section* + +Other changes +\layout Subsection* + +Implementation defined limits (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +The earlier version provided implementation defined limits for the maximum + number of match entries, the maximum number of memory descriptors, etc. 
+ Rather than spanning the entire implementation, these limits are now associated + with individual network interfaces. +\layout Subsection* + +Added User Ids (Section +\begin_inset LatexCommand \ref{sec:uid} + +\end_inset + +) +\layout Standard + +Group Ids had been used to simplify access control entries. + In particular, a process could allow access for all of the processes in + a group. + User Ids have been introduced to regain this functionality. + We use user ids to fill this role. +\layout Subsection* + +Removed Group Ids and Rank Ids (Section +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +) +\layout Standard + +The earlier version of Portals had two forms for addressing processes: <node id, process id> and <group id, rank id>. + A process group was defined as the collection of processes created during + application launch. + Each process in the group was given a unique rank id in the range 0 to + +\begin_inset Formula $n-1$ +\end_inset + + where +\begin_inset Formula $n$ +\end_inset + + was the number of processes in the group. + We removed groups because they are better handled in the runtime system. +\layout Subsection* + +Match lists (Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +) +\layout Standard + +It is no longer illegal to have an existing match entry when calling PtlMEAttach. + A position argument was added to the list of arguments supplied to +\emph on +PtlMEAttach +\emph default + to specify whether the new match entry is prepended or appended to the + existing list. + If there is no existing match list, the position argument is ignored. +\layout Subsection* + +Unlinking Memory Descriptors (Section +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +) +\layout Standard + +Previously, a memory descriptor could be unlinked if the offset exceeded + a threshold upon the completion of an operation. + In this version, the unlinking is delayed until there is a matching operation + which requires more memory than is currently available in the descriptor.
+ In addition to changes in that section, this led to a revision of Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + +. +\layout Subsection* + +Split Phase Operations and Events (Section +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + +) +\layout Standard + +Previously, there were five types of events: +\family typewriter +PTL_EVENT_PUT +\family default +, +\family typewriter +PTL_EVENT_GET +\family default +, +\family typewriter +PTL_EVENT_REPLY +\family default +, +\family typewriter +PTL_EVENT_SENT +\family default +, and +\family typewriter +PTL_EVENT_ACK. + +\family default +The first four of these reflected the completion of potentially long operations. + We have introduced new event types to reflect the fact that long operations + have a distinct starting point and a distinct completion point. + Moreover, the completion may be successful or unsuccessful. +\layout Standard + +In addition to providing a mechanism for reporting failure to higher levels + of software, this split provides an opportunity for improved ordering + semantics. + Previously, if one process initiated two operations (e.g., two put operations) + on a remote process, these operations were guaranteed to complete in the + same order that they were initiated. + Now, we only guarantee that the initiation events are delivered in the + same order. + In particular, the operations do not need to complete in the order that + they were initiated. +\layout Subsection* + +Well known process ids (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +To support the notion of +\begin_inset Quotes eld +\end_inset + +well known process ids, +\begin_inset Quotes erd +\end_inset + + we added a process id argument to the arguments for PtlNIInit. +\layout Chapter* + +Glossary +\layout Description + +API Application Programming Interface. + A definition of the functions and semantics provided by a library of functions.
+ +\layout Description + +Initiator A +\emph on +process +\emph default + that initiates a message operation. + +\layout Description + +Message An application-defined unit of data that is exchanged between +\emph on +processes +\emph default +. + +\layout Description + +Message\SpecialChar ~ +Operation Either a put operation, which writes data, or a get operation, + which reads data. + +\layout Description + +Network A network provides point-to-point communication between +\emph on +nodes +\emph default +. + Internally, a network may provide multiple routes between endpoints (to + improve fault tolerance or to improve performance characteristics); however, + multiple paths will not be exposed outside of the network. + +\layout Description + +Node A node is an endpoint in a +\emph on +network +\emph default +. + Nodes provide processing capabilities and memory. + A node may provide multiple processors (an SMP node) or it may act as a + +\emph on +gateway +\emph default + between networks. + +\layout Description + +Process A context of execution. + A process defines a virtual memory (VM) context. + This context is not shared with other processes. + Several threads may share the VM context defined by a process. + +\layout Description + +Target A +\emph on +process +\emph default + that is acted upon by a message operation. + +\layout Description + +Thread A context of execution that shares a VM context with other threads. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\layout Standard + +\backslash +setcounter{page}{1} +\backslash +pagenumbering{arabic} +\end_inset + + +\layout Chapter + +Introduction +\begin_inset LatexCommand \label{sec:intro} + +\end_inset + + +\layout Section + +Overview +\layout Standard + +This document describes an application programming interface for message + passing between nodes in a system area network. 
+ The goal of this interface is to improve the scalability and performance + of network communication by defining the functions and semantics of message + passing required for scaling a parallel computing system to ten thousand + nodes. + This goal is achieved by providing an interface that will allow a quality + implementation to take advantage of the inherently scalable design of Portals. +\layout Standard + +This document is divided into several sections: +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:intro} + +\end_inset + +---Introduction This section describes the purpose and scope of the Portals + API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +---An\SpecialChar ~ +Overview\SpecialChar ~ +of\SpecialChar ~ +the\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section gives a brief overview of the + Portals API. + The goal is to introduce the key concepts and terminology used in the descripti +on of the API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:api} + +\end_inset + +---The\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section describes the functions and semantics of + the Portals application programming interface. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +---The\SpecialChar ~ +Semantics\SpecialChar ~ +of\SpecialChar ~ +Message\SpecialChar ~ +Transmission This section describes the semantics + of message transmission. + In particular, the information transmitted in each type of message and + the processing of incoming messages. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:examples} + +\end_inset + +---Examples This section presents several examples intended to illustrate + the use of the Portals API.
+ +\layout Section + +Purpose +\layout Standard + +Existing message passing technologies available for commodity cluster networking + hardware do not meet the scalability goals required by the Cplant\SpecialChar ~ + +\begin_inset LatexCommand \cite{Cplant} + +\end_inset + + project at Sandia National Laboratories. + The goal of the Cplant project is to construct a commodity cluster that + can scale to the order of ten thousand nodes. + This number greatly exceeds the capacity for which existing message passing + technologies have been designed and implemented. +\layout Standard + +In addition to the scalability requirements of the network, these technologies + must also be able to support a scalable implementation of the Message Passing + Interface (MPI)\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPIstandard} + +\end_inset + + standard, which has become the +\shape italic +de facto +\shape default + standard for parallel scientific computing. + While MPI does not impose any scalability limitations, existing message + passing technologies do not provide the functionality needed to allow implement +ations of MPI to meet the scalability requirements of Cplant. +\layout Standard + +The following are properties of a network architecture that do not impose + any inherent scalability limitations: +\layout Itemize + +Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + + and TCP/IP sockets, have limitations on the number of peer connections + that can be established. + +\layout Itemize + +Network independence - Many communication systems depend on the host processor + to perform operations in order for messages in the network to be consumed. + Message consumption from the network should not be dependent on host processor + activity, such as the operating system scheduler or user-level thread scheduler. 
+ +\layout Itemize + +User-level flow control - Many communication systems manage flow control + internally to avoid depleting resources, which can significantly impact + performance as the number of communicating processes increases. + +\layout Itemize + +OS Bypass - High performance network communication should not involve memory + copies into or out of a kernel-managed protocol stack. + +\layout Standard + +The following are properties of a network architecture that do not impose + scalability limitations for an implementation of MPI: +\layout Itemize + +Receiver-managed - Sender-managed message passing implementations require + a persistent block of memory to be available for every process, requiring + memory resources to increase with job size and requiring user-level flow + control mechanisms to manage these resources. + +\layout Itemize + +User-level Bypass - While OS Bypass is necessary for high-performance, it + alone is not sufficient to support the Progress Rule of MPI asynchronous + operations. + +\layout Itemize + +Unexpected messages - Few communication systems have support for receiving + messages for which there is no prior notification. + Support for these types of messages is necessary to avoid flow control + and protocol overhead. + +\layout Section + +Background +\layout Standard + +Portals was originally designed for and implemented on the nCube machine + as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~ + +\begin_inset LatexCommand \cite{SUNMOS} + +\end_inset + + and Puma\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaOS} + +\end_inset + + lightweight kernel development projects. + Portals went through two design phases, the latter of which is used on + the 4500-node Intel TeraFLOPS machine\SpecialChar ~ + +\begin_inset LatexCommand \cite{TFLOPS} + +\end_inset + +. 
+ Portals have been very successful in meeting the needs of such a large + machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaMPI} + +\end_inset + +, but also for implementing the scalable run-time environment and parallel + I/O capabilities of the machine. +\layout Standard + +The second generation Portals implementation was designed to take full advantage + of the hardware architecture of large MPP machines. + However, efforts to implement this same design on commodity cluster technology + identified several limitations, due to the differences in network hardware + as well as to shortcomings in the design of Portals. +\layout Section + +Scalability +\layout Standard + +The primary goal in the design of Portals is scalability. + Portals are designed specifically for an implementation capable of supporting + a parallel job running on tens of thousands of nodes. + Performance is critical only in terms of scalability. + That is, the level of message passing performance is characterized by how + far it allows an application to scale and not by how it performs in micro-bench +marks (e.g., a two node bandwidth or latency test). +\layout Standard + +The Portals API is designed to allow for scalability, not to guarantee it. + Portals cannot overcome the shortcomings of a poorly designed application + program. + Applications that have inherent scalability limitations, either through + design or implementation, will not be transformed by Portals into scalable + applications. + Scalability must be addressed at all levels. + Portals do not inhibit scalability, but do not guarantee it either. +\layout Standard + +To support scalability, the Portals interface maintains a minimal amount + of state. + Portals provide reliable, ordered delivery of messages between pairs of + processes. 
+ They are connectionless: a process is not required to explicitly establish + a point-to-point connection with another process in order to communicate. + Moreover, all buffers used in the transmission of messages are maintained + in user space. + The target process determines how to respond to incoming messages, and + messages for which there are no buffers are discarded. +\layout Section + +Communication Model +\layout Standard + +Portals combine the characteristics of both one-sided and two-sided communication. + They define a +\begin_inset Quotes eld +\end_inset + +matching put +\begin_inset Quotes erd +\end_inset + + operation and a +\begin_inset Quotes eld +\end_inset + +matching get +\begin_inset Quotes erd +\end_inset + + operation. + The destination of a put (or send) is not an explicit address; instead, + each message contains a set of match bits that allow the receiver to determine + where incoming messages should be placed. + This flexibility allows Portals to support both traditional one-sided operation +s and two-sided send/receive operations. +\layout Standard + +Portals allow the target to determine whether incoming messages are acceptable. + A target process can choose to accept message operations from any specific + process or can choose to ignore message operations from any specific process. +\layout Section + +Zero Copy, OS Bypass and Application Bypass +\layout Standard + +In traditional system architectures, network packets arrive at the network + interface card (NIC), are passed through one or more protocol layers in + the operating system, and eventually copied into the address space of the + application. + As network bandwidth began to approach memory copy rates, reduction of + memory copies became a critical concern. + This concern led to the development of zero-copy message passing protocols + in which message copies are eliminated or pipelined to avoid the loss of + bandwidth.
+\layout Standard + +A typical zero-copy protocol has the NIC generate an interrupt for the CPU + when a message arrives from the network. + The interrupt handler then controls the transfer of the incoming message + into the address space of the appropriate application. + The interrupt latency, the time from the initiation of an interrupt until + the interrupt handler is running, is fairly significant. + To avoid this cost, some modern NICs have processors that can be programmed + to implement part of a message passing protocol. + Given a properly designed protocol, it is possible to program the NIC to + control the transfer of incoming messages, without needing to interrupt + the CPU. + Because this strategy does not need to involve the OS on every message + transfer, it is frequently called +\begin_inset Quotes eld +\end_inset + +OS Bypass. +\begin_inset Quotes erd +\end_inset + + ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + +, FM\SpecialChar ~ + +\begin_inset LatexCommand \cite{FM2} + +\end_inset + +, GM\SpecialChar ~ + +\begin_inset LatexCommand \cite{GM} + +\end_inset + +, and Portals are examples of OS Bypass protocols. +\layout Standard + +Many protocols that support OS Bypass still require that the application + actively participate in the protocol to ensure progress. + As an example, the long message protocol of PM requires that the application + receive and reply to a request to put or get a long message. + This complicates the runtime environment, requiring a thread to process + incoming requests, and significantly increases the latency required to + initiate a long message protocol. + The Portals message passing protocol does not require activity on the part + of the application to ensure progress. 
+ We use the term +\begin_inset Quotes eld +\end_inset + +Application Bypass +\begin_inset Quotes erd +\end_inset + + to refer to this aspect of the Portals protocol. +\layout Section + +Faults +\layout Standard + +Given the number of components that we are dealing with and the fact that + we are interested in supporting applications that run for very long times, + failures are inevitable. + The Portals API recognizes that the underlying transport may not be able + to successfully complete an operation once it has been initiated. + This is reflected in the fact that the Portals API reports three types + of events: events indicating the initiation of an operation, events indicating + the successful completion of an operation, and events indicating the unsuccessf +ul completion of an operation. + Every initiation event is eventually followed by a successful completion + event or an unsuccessful completion event. +\layout Standard + +Between the time an operation is started and the time that the operation + completes (successfully or unsuccessfully), any memory associated with + the operation should be considered volatile. + That is, the memory may be changed in unpredictable ways while the operation + is progressing. + Once the operation completes, the memory associated with the operation + will not be subject to further modification (from this operation). + Notice that unsuccessful operations may alter memory in an essentially + unpredictable fashion. +\layout Chapter + +An Overview of the Portals API +\begin_inset LatexCommand \label{sec:apiover} + +\end_inset + + +\layout Standard + +In this section, we give a conceptual overview of the Portals API. + The goal is to provide a context for understanding the detailed description + of the API presented in the next section. +\layout Section + +Data Movement +\begin_inset LatexCommand \label{sec:dmsemantics} + +\end_inset + + +\layout Standard + +A Portal represents an opening in the address space of a process. 
+ Other processes can use a Portal to read (get) or write (put) the memory + associated with the portal. + Every data movement operation involves two processes, the +\series bold +initiator +\series default + and the +\series bold +target +\series default +. + The initiator is the process that initiates the data movement operation. + The target is the process that responds to the operation by either accepting + the data for a put operation, or replying with the data for a get operation. +\layout Standard + +In this discussion, activities attributed to a process may refer to activities + that are actually performed by the process or +\emph on +on behalf of the process +\emph default +. + The inclusiveness of our terminology is important in the context of +\emph on +application bypass +\emph default +. + In particular, when we note that the target sends a reply in the case of + a get operation, it is possible that the reply will be generated by another + component in the system, bypassing the application. +\layout Standard + +Figures\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:put} + +\end_inset + + and +\begin_inset LatexCommand \ref{fig:get} + +\end_inset + + present graphical interpretations of the Portal data movement operations: + put and get. + In the case of a put operation, the initiator sends a put request message + containing the data to the target. + The target translates the Portal addressing information in the request + using its local Portal structures. + When the request has been processed, the target optionally sends an acknowledge +ment message.
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename put.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Put (Send) +\begin_inset LatexCommand \label{fig:put} + +\end_inset + + +\end_inset + + +\layout Standard + +In the case of a get operation, the initiator sends a get request to the + target. + As with the put operation, the target translates the Portal addressing + information in the request using its local Portal structures. + Once it has translated the Portal addressing information, the target sends + a reply that includes the requested data. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename get.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Get +\begin_inset LatexCommand \label{fig:get} + +\end_inset + + +\end_inset + + +\layout Standard + +We should note that Portal address translations are only performed on nodes + that respond to operations initiated by other nodes. + Acknowledgements and replies to get operations bypass the portals address + translation structures. +\layout Section + +Portal Addressing +\begin_inset LatexCommand \label{subsec:paddress} + +\end_inset + + +\layout Standard + +One-sided data movement models (e.g., shmem\SpecialChar ~ + +\begin_inset LatexCommand \cite{CraySHMEM} + +\end_inset + +, ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, MPI-2\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPI2} + +\end_inset + +) typically use a triple to address memory on a remote node. + This triple consists of a process id, memory buffer id, and offset. 
+ The process id identifies the target process, the memory buffer id specifies + the region of memory to be used for the operation, and the offset specifies + an offset within the memory buffer. +\layout Standard + +In addition to the standard address components (process id, memory buffer + id, and offset), a Portal address includes a set of match bits. + This addressing model is appropriate for supporting one-sided operations + as well as traditional two-sided message passing operations. + Specifically, the Portals API provides the flexibility needed for an efficient + implementation of MPI-1, which defines two-sided operations with one-sided + completion semantics. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:portals} + +\end_inset + + presents a graphical representation of the structures used by a target + in the interpretation of a Portal address. + The process id is used to route the message to the appropriate node and + is not reflected in this diagram. + The memory buffer id, called the +\series bold +portal id +\series default +, is used as an index into the Portal table. + Each element of the Portal table identifies a match list. + Each element of the match list specifies two bit patterns: a set of +\begin_inset Quotes eld +\end_inset + +don't care +\begin_inset Quotes erd +\end_inset + + bits, and a set of +\begin_inset Quotes eld +\end_inset + +must match +\begin_inset Quotes erd +\end_inset + + bits. + In addition to the two sets of match bits, each match list element has + at most one memory descriptor. + Each memory descriptor identifies a memory region and an optional event + queue. + The memory region specifies the memory to be used in the operation and + the event queue is used to record information about these operations. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename portals.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 305pt + lyxheight 106pt +\end_inset + + +\layout Caption + +Portal Addressing Structures +\begin_inset LatexCommand \label{fig:portals} + +\end_inset + + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + + illustrates the steps involved in translating a Portal address, starting + from the first element in a match list. + If the match criteria specified in the match list entry are met and the + memory descriptor list accepts the operation +\begin_inset Foot +collapsed true + +\layout Standard + +Memory descriptors can reject operations because a threshold has been exceeded + or because the memory region does not have sufficient space, see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + +, the operation (put or get) is performed using the memory region specified + in the memory descriptor. + If the memory descriptor specifies that it is to be unlinked when a threshold + has been exceeded, the match list entry is removed from the match list + and the resources associated with the memory descriptor and match list + entry are reclaimed. + Finally, if there is an event queue specified in the memory descriptor, + the operation is logged in the event queue. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename flow_new.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 447pt + lyxheight 282pt +\end_inset + + +\layout Caption + +Portals Address Translation +\begin_inset LatexCommand \label{fig:flow} + +\end_inset + + +\end_inset + + +\layout Standard + +If the match criteria specified in the match list entry are not met, or + there is no memory descriptor associated with the match list entry, or + the memory descriptor associated with the match list entry rejects the + operation, the address translation continues with the next match list entry. + If the end of the match list has been reached, the address translation + is aborted and the incoming request is discarded. +\layout Section + +Access Control +\layout Standard + +A process can control access to its portals using an access control list. + Each entry in the access control list specifies a process id and a Portal + table index. + The access control list is actually an array of entries. + Each incoming request includes an index into the access control list (i.e., + a +\begin_inset Quotes eld +\end_inset + +cookie +\begin_inset Quotes erd +\end_inset + + or hint). + If the id of the process issuing the request doesn't match the id specified + in the access control list entry or the Portal table index specified in + the request doesn't match the Portal table index specified in the access + control list entry, the request is rejected. + Process identifiers and Portal table indexes may include wild card values + to increase the flexibility of this mechanism. + +\layout Standard + +Two aspects of this design merit further discussion. + First, the model assumes that the information in a message header, the + sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header + is trustworthy; however, using cryptographic techniques, we could easily + devise a protocol that would ensure the authenticity of the sender. +\layout Standard + +Second, because the access check is performed by the receiver, it is possible + that a malicious process will generate thousands of messages that will + be denied by the receiver. + This could saturate the network and/or the receiver, resulting in a +\emph on +denial of service +\emph default + attack. + Moving the check to the sender using capabilities, would remove the potential + for this form of attack. + However, the solution introduces the complexities of capability management + (exchange of capabilities, revocation, protections, etc). +\layout Section + +Multi-threaded Applications +\begin_inset LatexCommand \label{sec:threads} + +\end_inset + + +\layout Standard + +The Portals API supports a generic view of multi-threaded applications. + From the perspective of the Portals API, an application program is defined + by a set of processes. + Each process defines a unique address space. + The Portals API defines access to this address space from other processes + (using portals addressing and the data movement operations). + A process may have one or more +\emph on +threads +\emph default + executing in its address space. + +\layout Standard + +With the exception of +\emph on +PtlEQWait +\emph default + every function in the Portals API is non-blocking and atomic with respect + to both other threads and external operations that result from data movement + operations. + While individual operations are atomic, sequences of these operations may + be interleaved between different threads and with external operations. + The Portals API does not provide any mechanisms to control this interleaving. + It is expected that these mechanisms will be provided by the API used to + create threads. 
+\layout Chapter + +The Portals API +\begin_inset LatexCommand \label{sec:api} + +\end_inset + + +\layout Section + +Naming Conventions +\begin_inset LatexCommand \label{sec:conv} + +\end_inset + + +\layout Standard + +The Portals API defines two types of entities: functions and types. + Functions always start with +\emph on +Ptl +\emph default + and use mixed upper and lower case. + When used in the body of this report, function names appear in italic face, + e.g., +\emph on +PtlInit +\emph default +. + The functions associated with an object type will have names that start + with +\emph on +Ptl +\emph default +, followed by the two letter object type code shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + As an example, the function +\emph on +PtlEQAlloc +\emph default + allocates resources for an event queue. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Object Type Codes +\begin_inset LatexCommand \label{tab:objcodes} + +\end_inset + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\newline + +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\emph on +xx +\end_inset + + +\begin_inset Text + +\layout Standard + + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +EQ +\end_inset + + +\begin_inset Text + +\layout Standard + + Event Queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + MD +\end_inset + + +\begin_inset Text + +\layout Standard + + Memory Descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset 
Text + +\layout Standard + + ME +\end_inset + + +\begin_inset Text + +\layout Standard + + Match list Entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + NI +\end_inset + + +\begin_inset Text + +\layout Standard + + Network Interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Type names use lower case with underscores to separate words. + Each type name starts with +\family typewriter +ptl_ +\family default + and ends with +\family typewriter +_t +\family default +. + When used in the body of this report, type names appear in a fixed font, + e.g., +\family typewriter +ptl_match_bits_t +\family default +. +\layout Standard + +Names for constants use upper case with underscores to separate words. + Each constant name starts with +\family typewriter +PTL_ +\family default +. + When used in the body of this report, constant names appear in a fixed font, + e.g., +\family typewriter +PTL_OK +\family default +. +\layout Section + +Base Types +\layout Standard + +The Portals API defines a variety of base types. + These types represent a simple renaming of the base types provided by the + C programming language. + In most cases these new type names have been introduced to improve type + safety and to avoid issues arising from differences in representation sizes + (e.g., 16-bit or 32-bit integers). +\layout Subsection + +Sizes +\begin_inset LatexCommand \label{sec:size-t} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_size_t +\family default + is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection + +Handles +\begin_inset LatexCommand \label{sec:handle-type} + +\end_inset + + +\layout Standard + +Objects maintained by the API are accessed through handles. + Handle types have names of the form +\family typewriter +ptl_handle_ +\emph on +xx +\emph default +_t +\family default +, where +\emph on +xx +\emph default + is one of the two letter object type codes shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + For example, the type +\family typewriter +ptl_handle_ni_t +\family default + is used for network interface handles. +\layout Standard + +Each type of object is given a unique handle type to enhance type checking. + The type, +\family typewriter +ptl_handle_any_t +\family default +, can be used when a generic handle is needed. + Every handle value can be converted into a value of type +\family typewriter +ptl_handle_any_t +\family default + without loss of information. +\layout Standard + +Handles are not simple values. + Every portals object is associated with a specific network interface and + an identifier for this interface (along with an object identifier) is part + of the handle for the object. +\layout Standard + +The special value +\family typewriter +PTL_EQ_NONE +\family default +, of type +\family typewriter +ptl_handle_eq_t +\family default +, is used to indicate the absence of an event queue. + See sections +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + for uses of this value. +\layout Subsection + +Indexes +\begin_inset LatexCommand \label{sec:index-type} + +\end_inset + + +\layout Standard + +The types +\family typewriter +ptl_pt_index_t +\family default + and +\family typewriter +ptl_ac_index_t +\family default + are integral types used for representing Portal table indexes and access + control table indexes, respectively.
+ See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + + for limits on values of these types. +\layout Subsection + +Match Bits +\begin_inset LatexCommand \label{sec:mb-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_match_bits_t +\family default + is capable of holding unsigned 64-bit integer values. +\layout Subsection + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_interface_t +\family default + is an integral type used for identifying different network interfaces. + Users will need to consult the local documentation to determine appropriate + values for the interfaces available. + The special value +\family typewriter +PTL_IFACE_DEFAULT +\family default + identifies the default interface. +\layout Subsection + +Identifiers +\begin_inset LatexCommand \label{sec:id-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_nid_t +\family default + is an integral type used for representing node ids +\family typewriter +, ptl_pid_t +\family default + is an integral type for representing process ids, and +\family typewriter +ptl_uid_t +\family default + is an integral type for representing user ids. +\layout Standard + +The special value +\family typewriter +PTL_PID_ANY +\family default + matches any process identifier, +\family typewriter +PTL_NID_ANY +\family default + matches any node identifier, + and +\family typewriter +PTL_UID_ANY +\family default + matches any user identifier. + See sections +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + for uses of these values.
+\layout Subsection + +Status Registers +\begin_inset LatexCommand \label{sec:stat-type} + +\end_inset + + +\layout Standard + +Each network interface maintains an array of status registers that can be + accessed using the +\family typewriter +PtlNIStatus +\family default + function (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + The type +\family typewriter +ptl_sr_index_t +\family default + defines the types of indexes that can be used to access the status registers. + The only index defined for all implementations is +\family typewriter +PTL_SR_DROP_COUNT +\family default + which identifies the status register that counts the dropped requests for + the interface. + Other indexes (and registers) may be defined by the implementation. +\layout Standard + +The type +\family typewriter +ptl_sr_value_t +\family default + defines the types of values held in status registers. + This is a signed integer type. + The size is implementation dependent, but must be at least 32 bits. +\layout Section + +Initialization and Cleanup +\begin_inset LatexCommand \label{sec:init} + +\end_inset + + +\layout Standard + +The Portals API includes a function, +\emph on +PtlInit +\emph default +, to initialize the library and a function, +\emph on +PtlFini +\emph default +, to cleanup after the application is done using the library. +\layout Subsection + +PtlInit +\begin_inset LatexCommand \label{sec:ptlinit} + +\end_inset + + +\layout LyX-Code + +int PtlInit( int *max_interfaces ); +\layout Standard +\noindent +The +\emph on +PtlInit +\emph default + function initializes the Portals library. + PtlInit must be called at least once by a process before any thread makes + a Portals function call, but may be safely called more than once. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_FAIL Indicates an error during initialization. 
+ +\layout Description + +PTL_SEGV Indicates that +\family typewriter +max_interfaces +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +max_interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the maximum number of interfaces + that can be initialized. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlFini +\begin_inset LatexCommand \label{sec:ptlfini} + +\end_inset + + +\layout LyX-Code + +void PtlFini( void ); +\layout Standard +\noindent +The +\emph on +PtlFini +\emph default + function cleans up after the Portals library is no longer needed by a process. + After this function is called, calls to any of the functions defined by + the Portal API or use of the structures set up by the Portals API will + result in undefined behavior. + This function should be called once and only once during termination by + a process. + Typically, this function will be called in the exit sequence of a process. + Individual threads should not call PtlFini when they terminate. +\layout Section + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni} + +\end_inset + + +\layout Standard + +The Portals API supports the use of multiple network interfaces. + However, each interface is treated as an independent entity. + Combining interfaces (e.g., +\begin_inset Quotes eld +\end_inset + +bonding +\begin_inset Quotes erd +\end_inset + + to create a higher bandwidth connection) must be implemented by the application + or embedded in the underlying network. + Interfaces are treated as independent entities to make it easier to cache + information on individual network interface cards. 
+\layout Standard + +Once initialized, each interface provides a Portal table, an access control + table, and a collection of status registers. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for a discussion of updating Portal table entries using the +\emph on +PtlMEAttach +\emph default + function. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + for a discussion of the initialization and updating of entries in the access + control table. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + for a discussion of the +\emph on +PtlNIStatus +\emph default + function which can be used to determine the value of a status register. +\layout Standard + +Every other type of Portal object (e.g., memory descriptor, event queue, or + match list entry) is associated with a specific network interface. + The association to a network interface is established when the object is + created and is encoded in the handle for the object. +\layout Standard + +Each network interface is initialized and shutdown independently. + The initialization routine, +\emph on +PtlNIInit +\emph default +, returns a handle for an interface object which is used in all subsequent + Portal operations. + The +\emph on +PtlNIFini +\emph default + function is used to shutdown an interface and release any resources that + are associated with the interface. + Network interface handles are associated with processes, not threads. + All threads in a process share all of the network interface handles. 
+\layout Standard + +The Portals API also defines the +\emph on +PtlNIStatus +\emph default + function to query the status registers for a network interface, the +\emph on +PtlNIDist +\emph default + function to determine the +\begin_inset Quotes eld +\end_inset + +distance +\begin_inset Quotes erd +\end_inset + + to another process, and the +\emph on +PtlNIHandle +\emph default + function to determine the network interface that an object is associated + with. +\layout Subsection + +PtlNIInit +\begin_inset LatexCommand \label{sec:niinit} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + int max_match_entries; +\newline + int max_mem_descriptors; +\newline + int max_event_queues; +\newline + ptl_ac_index_t max_atable_index; +\newline + ptl_pt_index_t max_ptable_index; +\newline +} ptl_ni_limits_t; +\newline + +\newline +int PtlNIInit( ptl_interface_t interface, +\newline + ptl_pid_t pid, +\newline + ptl_ni_limits_t* desired, +\newline + ptl_ni_limits_t* actual, +\newline + ptl_handle_ni_t* handle ); +\layout Standard + +Values of type +\family typewriter +ptl_ni_limits_t +\family default + include the following members: +\layout Description + +max_match_entries Maximum number of match entries that can be allocated + at any one time. +\layout Description + +max_mem_descriptors Maximum number of memory descriptors that can be allocated + at any one time. +\layout Description + +max_event_queues Maximum number of event queues that can be allocated at + any one time. +\layout Description + +max_atable_index Largest access control table index for this interface, + valid indexes range from zero to +\family typewriter +max_atable_index +\family default +, inclusive. +\layout Description + +max_ptable_index Largest Portal table index for this interface, valid indexes + range from zero to +\family typewriter +max_ptable_index +\family default +, inclusive.
+\layout Standard +\noindent +The +\emph on +PtlNIInit +\emph default + function is used to initialize the Portals API for a network interface. + This function must be called at least once by each process before any other + operations that apply to the interface by any process or thread. + For subsequent calls to +\shape italic +PtlNIInit +\shape default + from within the same process (either by different threads or the same thread), + the desired limits will be ignored and the call will return the existing + NI handle. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INIT_DUP Indicates a duplicate initialization of +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INIT_INV Indicates that +\family typewriter +interface +\family default + is not a valid network interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to initialize the + interface. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +pid +\family default + is not a valid process id. +\layout Description + +PTL_SEGV Indicates that +\family typewriter +actual +\family default + or +\family typewriter + handle +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the network interface to be initialized. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + for a discussion of values used to identify network interfaces.)
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +pid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the desired process id (for well known process ids). + The value +\family typewriter +PTL_PID_ANY +\family default + may be used to have the process id assigned by the underlying library. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +desired +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If non-NULL, points to a structure that holds the desired limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +actual +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, the location pointed to by actual will hold the actual + limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the interface. +\end_inset + + + + +\end_inset + + +\layout Comment + +The use of desired is implementation dependent. + In particular, an implementation may choose to ignore this argument. +\layout Subsection + +PtlNIFini +\begin_inset LatexCommand \label{sec:nifini} + +\end_inset + + +\layout LyX-Code + +int PtlNIFini( ptl_handle_ni_t interface ); +\layout Standard +\noindent +The +\emph on +PtlNIFini +\emph default + function is used to release the resources allocated for a network interface. 
+ Once the +\emph on +PtlNIFini +\emph default + operation has been started, the results of pending API operations (e.g., + operations initiated by another thread) for this interface are undefined. + Similarly, the effects of incoming operations (puts and gets) or return + values (acknowledgements and replies) for this interface are undefined. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the interface to shutdown. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlNIStatus +\begin_inset LatexCommand \label{sec:nistatus} + +\end_inset + + +\layout LyX-Code + +int PtlNIStatus( ptl_handle_ni_t interface, +\newline + ptl_sr_index_t status_register, +\newline + ptl_sr_value_t* status ); +\layout Standard +\noindent +The +\emph on +PtlNIStatus +\emph default + function returns the value of a status register for the specified interface. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + for more information on status register indexes and status register values.) +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. 
+ +\layout Description + +PTL_INV_SR_INDX Indicates that +\family typewriter +status_register +\family default + is not a valid status register. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +status +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status_register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An index for the status register to read. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the current value of the status + register. +\end_inset + + + + +\end_inset + + +\layout Comment + +The only status register that must be defined is a drop count register ( +\family typewriter +PTL_SR_DROP_COUNT +\family default +). + Implementations may define additional status registers. + Identifiers for the indexes associated with these registers should start + with the prefix +\family typewriter +PTL_SR_ +\family default +. +\layout Subsection + +PtlNIDist +\layout LyX-Code + +int PtlNIDist( ptl_handle_ni_t interface, +\newline + ptl_process_id_t process, +\newline + unsigned long* distance ); +\layout Standard +\noindent +The +\emph on +PtlNIDist +\emph default + function returns the distance to another process using the specified interface. 
+ Distances are only defined relative to an interface. + Distance comparisons between different interfaces on the same process may + be meaningless. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +process +\family default + is not a valid process identifier. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +distance +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An identifier for the process whose distance is being requested. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +distance +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the distance to the remote + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +This function should return a static measure of distance. + Examples include minimum latency, the inverse of available bandwidth, or + the number of switches between the two endpoints. 
+\layout Subsection + +PtlNIHandle +\layout LyX-Code + +int PtlNIHandle( ptl_handle_any_t handle, +\newline + ptl_handle_ni_t* interface ); +\layout Standard +\noindent +The +\emph on +PtlNIHandle +\emph default + function returns a handle for the network interface with which the object + identified by +\family typewriter +handle +\family default + is associated. + If the object identified by +\family typewriter +handle +\family default + is a network interface, this function returns the same value it is passed. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_HANDLE Indicates that +\family typewriter +handle +\family default + is not a valid handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +interface +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the object. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the network interface + associated with +\family typewriter +handle +\family default +. +\end_inset + + + + +\end_inset + + +\layout Comment + +Every handle should encode the network interface and the object id relative + to this handle. + Both are presumably encoded using integer values. 
+\layout Section + +User Identification +\begin_inset LatexCommand \label{sec:uid} + +\end_inset + + +\layout Standard + +Every process runs on behalf of a user. + +\layout Subsection + +PtlGetUid +\layout LyX-Code + +int PtlGetUid( ptl_handle_ni_t ni_handle, +\newline + ptl_uid_t* uid ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +uid +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +uid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the user id for the calling + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that user identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, a process may have multiple + user identifiers. +\layout Section + +Process Identification +\begin_inset LatexCommand \label{sec:pid} + +\end_inset + + +\layout Standard + +Processes that use the Portals API can be identified using a node id and + process id.
+ Every node accessible through a network interface has a unique node identifier + and every process running on a node has a unique process identifier. + As such, any process in the computing system can be identified by its node + id and process id. + +\layout Standard + +The Portals API defines a type, +\family typewriter +ptl_process_id_t +\family default + for representing process ids and a function, +\emph on +PtlGetId +\emph default +, which can be used to obtain the id of the current process. +\layout Comment + +The portals API does not include thread identifiers. + Messages are delivered to processes (address spaces) not threads (contexts + of execution). +\layout Subsection + +The Process Id Type +\begin_inset LatexCommand \label{sec:pid-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_nid_t nid; /* node id */ +\newline + ptl_pid_t pid; /* process id */ +\newline +} ptl_process_id_t; +\layout Standard +\noindent +The +\family typewriter +ptl_process_id_t +\family default + type uses two identifiers to represent a process id: a node id and a process + id. + +\layout Subsection + +PtlGetId +\begin_inset LatexCommand \label{sub:PtlGetId} + +\end_inset + + +\layout LyX-Code + +int PtlGetId( ptl_handle_ni_t ni_handle, +\newline + ptl_process_id_t* id ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +id +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the id for the calling process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that process identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, it may have multiple + node identifiers. +\layout Section + +Match List Entries and Match Lists +\begin_inset LatexCommand \label{sec:me} + +\end_inset + + +\layout Standard + +A match list is a chain of match list entries. + Each match list entry includes a memory descriptor and a set of match criteria. + The match criteria can be used to reject incoming requests based on process + id or the match bits provided in the request. + A match list is created using the +\emph on +PtlMEAttach +\emph default + or +\shape italic +PtlMEAttachAny +\shape default + functions, which create a match list consisting of a single match list + entry, attaches the match list to the specified Portal index, and returns + a handle for the match list entry. + Match entries can be dynamically inserted and removed from a match list + using the +\emph on +PtlMEInsert +\emph default + and +\emph on +PtlMEUnlink +\emph default + functions. 
+\layout Subsection + +PtlMEAttach +\begin_inset LatexCommand \label{sec:meattach} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t; +\newline + +\layout LyX-Code + +typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t; +\newline + +\layout LyX-Code + +int PtlMEAttach( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ins_pos_t +\family default + are used to control where a new item is inserted. + The value +\family typewriter +PTL_INS_BEFORE +\family default + is used to insert the new item before the current item or before the head + of the list. + The value +\family typewriter +PTL_INS_AFTER +\family default + is used to insert the new item after the current item or after the last + item in the list. + +\layout Standard + +The +\emph on +PtlMEAttach +\emph default + function creates a match list consisting of a single entry and attaches + this list to the Portal table for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PTINDEX Indicates that +\family typewriter +index +\family default + is not a valid Portal table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. 
+ +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The Portal table index where the match list should be attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specifies the match criteria for the process id of the requestor. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +match_bits, ignorebits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specify the match criteria to apply to the match bits in the incoming request. + The +\family typewriter +ignorebits +\family default + are used to mask out insignificant bits in the incoming match bits. 
+ The resulting bits are then compared to the match list entry's match + bits to determine if the incoming request meets the match criteria. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates the match list entry should be unlinked when the last memory descripto +r associated with this match list entry is unlinked. + (Note, the check for unlinking a match entry only occurs when a memory + descriptor is unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be prepended or appended to + the existing match list. + If there is no existing list, this argument is ignored and the new match + entry becomes the only entry in the list. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. 
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEAttachAny +\begin_inset LatexCommand \label{sec:attachany} + +\end_inset + + +\layout LyX-Code + +int PtlMEAttachAny( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t *index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEAttachAny +\emph default + function creates a match list consisting of a single entry and attaches + this list to an unused Portal table entry for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_PT_FULL Indicates that there are no free entries in the Portal table. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the Portal index where the + match list has been attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid, match_bits, ignorebits, unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\shape italic +PtlMEAttach +\shape default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEInsert +\begin_inset LatexCommand \label{sec:meinsert} + +\end_inset + + +\layout LyX-Code + +int PtlMEInsert( ptl_handle_me_t current, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEInsert +\emph default + function creates a new match list entry and inserts this entry into the + match list containing +\family typewriter +current +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier.
+ +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +current +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match entry. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +current +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for a match entry. + The new match entry will be inserted immediately before or immediately + after this match entry. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\family default +, +\family typewriter +match_bits +\family default +, +\family typewriter +ignorebits +\family default +, +\family typewriter +unlink +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be inserted before or after + the +\family typewriter +current +\family default + entry. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default +. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEUnlink +\begin_inset LatexCommand \label{sec:meunlink} + +\end_inset + + +\layout LyX-Code + +int PtlMEUnlink( ptl_handle_me_t entry ); +\layout Standard +\noindent +The +\emph on +PtlMEUnlink +\emph default + function can be used to unlink a match entry from a match list. + This operation also releases any resources associated with the match entry + (including the associated memory descriptor). + It is an error to use the match entry handle after calling +\emph on +PtlMEUnlink +\emph default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +entry +\family default + is not a valid match entry handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the match entry to be unlinked. +\end_inset + + + + +\end_inset + + +\layout Section + +Memory Descriptors +\begin_inset LatexCommand \label{sec:md} + +\end_inset + + +\layout Standard + +A memory descriptor contains information about a region of an application + process' memory and an event queue where information about the operations + performed on the memory descriptor is recorded.
+ The Portals API provides two operations to create memory descriptors: +\emph on +PtlMDAttach +\emph default +, and +\emph on +PtlMDBind +\emph default +; an operation to update a memory descriptor, +\emph on +PtlMDUpdate +\emph default +; and an operation to unlink and release the resources associated with a + memory descriptor, +\emph on +PtlMDUnlink +\emph default +. +\layout Subsection + +The Memory Descriptor Type +\begin_inset LatexCommand \label{sec:md-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + void* start; +\newline + ptl_size_t length; +\newline + int threshold; +\newline + unsigned int max_offset; +\newline + unsigned int options; +\newline + void* user_ptr; +\newline + ptl_handle_eq_t eventq; +\newline +} ptl_md_t; +\layout Standard +\noindent +The +\family typewriter +ptl_md_t +\family default + type defines the application view of a memory descriptor. + Values of this type are used to initialize and update the memory descriptors. +\layout Subsubsection + +Members +\layout Description + +start,\SpecialChar ~ +length Specify the memory region associated with the memory descriptor. + The +\family typewriter +start +\family default + member specifies the starting address for the memory region and the +\family typewriter +length +\family default + member specifies the length of the region. + The +\family typewriter +start member +\family default + can be NULL provided that the +\family typewriter +length +\family default + member is zero. + (Zero length buffers are useful to record events.) There are no alignment + restrictions on the starting address or the length of the region; although, + unaligned messages may be slower (i.e., lower bandwidth and/or longer latency) + on some implementations. + +\layout Description + +threshold Specifies the maximum number of operations that can be performed + on the memory descriptor. 
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + for the different types of events). + In the usual case, the threshold value is decremented for each operation + on the memory descriptor. + When the threshold value is zero, the memory descriptor is +\emph on +inactive +\emph default +, and does not respond to operations. + A memory descriptor can have an initial threshold value of zero to allow + for manipulation of an inactive memory descriptor by the local process. + A threshold value of +\family typewriter +PTL_MD_THRESH_INF +\family default + indicates that there is no bound on the number of operations that may be + applied to a memory descriptor. + Note that local operations (e.g., +\emph on +PtlMDUpdate +\emph default +) are not applied to the threshold count. + +\layout Description + +max_offset Specifies the maximum local offset of a memory descriptor. + When the local offset of a memory descriptor exceeds this maximum, the + memory descriptor becomes +\shape italic +inactive +\shape default + and does not respond to further operations. +\layout Description + +options Specifies the behavior of the memory descriptor. + There are five options that can be selected: enable put operations (yes + or no), enable get operations (yes or no), offset management (local or + remote), message truncation (yes or no), and acknowledgement (yes or no). + Values for this argument can be constructed using a bitwise or of the following + values: +\begin_deeper +\begin_deeper +\layout Description + +PTL_MD_OP_PUT Specifies that the memory descriptor will respond to +\emph on +put +\emph default + operations. + By default, memory descriptors reject +\emph on +put +\emph default + operations. + +\layout Description + +PTL_MD_OP_GET Specifies that the memory descriptor will respond to +\emph on +get +\emph default + operations. 
+ By default, memory descriptors reject +\emph on +get +\emph default + operations. + +\layout Description + +PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory + region is provided by the incoming request. + By default, the offset is maintained locally. + When the offset is maintained locally, the offset is incremented by the + length of the request so that the next operation (put and/or get) will + access the next part of the memory region. +\layout Description + +PTL_MD_TRUNCATE Specifies that the length provided in the incoming request + can be reduced to match the memory available in the region. + (The memory available in a memory region is determined by subtracting the + offset from the length of the memory region.) By default, if the length + in the incoming operation is greater than the amount of memory available, + the operation is rejected. + +\layout Description + +PTL_MD_ACK_DISABLE Specifies that an acknowledgement should +\emph on +not +\emph default + be sent for incoming +\emph on +put +\emph default + operations, even if requested. + By default, acknowledgements are sent for +\emph on +put +\emph default + operations that request an acknowledgement. + Acknowledgements are never sent for +\emph on +get +\emph default + operations. + The value sent in the reply serves as an implicit acknowledgement. + +\end_deeper +\layout Standard + + +\series bold +Note +\series default +: It is not considered an error to have a memory descriptor that does not + respond to either +\emph on +put +\emph default + or +\emph on +get +\emph default + operations: Every memory descriptor responds to +\emph on +reply +\emph default + operations. + Nor is it considered an error to have a memory descriptor that responds + to both +\emph on +put +\emph default + and +\emph on +get +\emph default + operations. + +\end_deeper +\layout Description + +user_ptr A user-specified value that is associated with the memory descriptor. 
+ The value does not need to be a pointer, but must fit in the space used + by a pointer. + This value (along with other values) is recorded in events associated with + operations on this memory descriptor. +\begin_inset Foot +collapsed true + +\layout Standard + +Tying the memory descriptor to a user-defined value can be useful when multiple + memory descriptors share the same event queue or when the memory descriptor + needs to be associated with a data structure maintained by the application. + For example, an MPI implementation can set the +\family typewriter +user_ptr +\family default + argument to the value of an MPI Request. + This direct association allows for processing of memory descriptors by + the MPI implementation without a table lookup or a search for the appropriate + MPI Request. +\end_inset + + +\layout Description + +eventq A handle for the event queue used to log the operations performed + on the memory region. + If this argument is +\family typewriter +PTL_EQ_NONE +\family default +, operations performed on this memory descriptor are not logged. + +\layout Subsection + +PtlMDAttach +\begin_inset LatexCommand \label{sec:mdattach} + +\end_inset + + +\layout LyX-Code + +int PtlMDAttach( ptl_handle_me_t match, +\newline + ptl_md_t mem_desc, +\newline + ptl_unlink_t unlink_op, +\newline + ptl_unlink_t unlink_nofit, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_unlink_t +\family default + are used to control whether an item is unlinked from a list. + The value +\family typewriter +PTL_UNLINK +\family default + enables unlinking. + The value +\family typewriter +PTL_RETAIN +\family default + disables unlinking. +\layout Standard + +The +\emph on +PtlMDAttach +\emph default + operation is used to create a memory descriptor and attach it to a match + list entry. + An error code is returned if this match list entry already has an associated + memory descriptor. 
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INUSE Indicates that +\family typewriter +match +\family default + already has a memory descriptor attached. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +match +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface associated with +\family typewriter +match +\family default +. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the match entry that the memory descriptor will be associated + with. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_op +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when it becomes + inactive, either because the operation threshold drops to zero or because + the maximum offset has been exceeded. + (Note, the check for unlinking a memory descriptor only occurs after + the completion of a successful operation. + If the threshold is set to zero during initialization or using +\emph on +PtlMDUpdate +\emph default +, the memory descriptor is +\series bold +not +\series default + unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_nofit +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when the space + remaining in the memory descriptor is not sufficient for a matching operation. + If an incoming message arrives at a memory descriptor that does + not have sufficient space and the +\series bold +PTL_MD_TRUNCATE +\series default + operation is not specified, the memory descriptor will be unlinked. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument can be NULL, in which case the handle will not be returned. 
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDBind +\begin_inset LatexCommand \label{sec:mdbind} + +\end_inset + + +\layout LyX-Code + +int PtlMDBind( ptl_handle_ni_t interface, +\newline + ptl_md_t mem_desc, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlMDBind +\emph default + operation is used to create a +\begin_inset Quotes eld +\end_inset + +free floating +\begin_inset Quotes erd +\end_inset + + memory descriptor, i.e., a memory descriptor that is not associated with + a match list entry. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface, +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INV_EQ Indicates that the event queue associated with +\family typewriter +mem_desc +\family default + is not valid. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the network interface with which the memory descriptor will + be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument must be a valid address and cannot be NULL. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUnlink +\begin_inset LatexCommand \label{sec:mdfree} + +\end_inset + + +\layout LyX-Code + +int PtlMDUnlink( ptl_handle_md_t mem_desc ); +\layout Standard +\noindent +The +\emph on +PtlMDUnlink +\emph default + function unlinks the memory descriptor from any match list entry it may + be linked to and releases the resources associated with a memory descriptor. + (This function does not free the memory region associated with the memory + descriptor.) This function also releases the resources associated with a + floating memory descriptor. + Only memory descriptors with no pending operations may be unlinked. 
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. +\layout Description + +PTL_MD_INUSE Indicates that +\family typewriter +mem_desc +\family default + has pending operations and cannot be unlinked. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUpdate +\begin_inset LatexCommand \label{sec:mdupdate} + +\end_inset + + +\layout LyX-Code + +int PtlMDUpdate( ptl_handle_md_t mem_desc, +\newline + ptl_md_t* old_md, +\newline + ptl_md_t* new_md, +\newline + ptl_handle_eq_t testq ); +\layout Standard +\noindent +The +\emph on +PtlMDUpdate +\emph default + function provides a conditional, atomic update operation for memory descriptors. + The memory descriptor identified by +\family typewriter +mem_desc +\family default + is only updated if the event queue identified by +\family typewriter +testq +\family default + is empty. + The intent is to only enable updates to the memory descriptor when no new + messages have arrived since the last time the queue was checked. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + + for an example of how this function can be used. 
+\layout Standard + +If +\family typewriter +new_md +\family default + is not NULL, the memory descriptor identified by +\family typewriter +mem_desc +\family default + will be updated + to reflect the values in the structure pointed to by +\family typewriter +new_md +\family default + if +\family typewriter +testq +\family default + has the value +\family typewriter +PTL_EQ_NONE +\family default + or if the event queue identified by +\family typewriter +testq +\family default + is empty. + If +\family typewriter +old_md +\family default + is not NULL, the current value of the memory descriptor identified by +\family typewriter +mem_desc +\family default + is recorded in the location identified by +\family typewriter +old_md +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_NOUPDATE Indicates that the update was not performed because +\family typewriter +testq +\family default + was not empty. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. + +\layout Description + +PTL_ILL_MD Indicates that the value pointed to by +\family typewriter +new_md +\family default + is not a legal memory descriptor (e.g., the memory region specified by the + memory descriptor may be invalid). + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +testq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +new_md +\family default + or +\family typewriter +old_md +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to update. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +old_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +old_md +\family default + is not the value +\family typewriter +NULL +\family default +, the current value of the memory descriptor will be stored in the location + identified by +\family typewriter +old_md +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +new_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +new_md +\family default + is not the value +\family typewriter +NULL +\family default +, this argument provides the new values for the memory descriptor, if the + update is performed. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +testq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for an event queue used to predicate the update. + If +\family typewriter +testq +\family default + is equal to +\family typewriter +PTL_EQ_NONE +\family default +, the update is performed unconditionally. + Otherwise, the update is performed if and only if +\family typewriter +testq +\family default + is empty. + If the update is not performed, the function returns the value +\family typewriter +PTL_NOUPDATE +\family default +. 
+ (Note, the +\family typewriter +testq +\family default + argument does not need to be the same as the event queue associated with + the memory descriptor.) +\end_inset + + + + +\end_inset + + +\layout Standard + +The conditional update can be used to ensure that the memory descriptor + has not changed between the time it was examined and the time it is updated. + In particular, it is needed to support an MPI implementation where the + activity of searching an unexpected message queue and posting a receive + must be atomic. +\layout Section + +Events and Event Queues +\begin_inset LatexCommand \label{sec:eq} + +\end_inset + + +\layout Standard + +Event queues are used to log operations performed on memory descriptors. + They can also be used to hold acknowledgements for completed +\emph on +put +\emph default + operations and to note when the data specified in a +\emph on +put +\emph default + operation has been sent (i.e., when it is safe to reuse the buffer that holds + this data). + Multiple memory descriptors can share a single event queue. +\layout Standard + +In addition to the +\family typewriter +ptl_handle_eq_t +\family default + type, the Portals API defines two types associated with events: The +\family typewriter + +\newline +ptl_event_kind_t +\family default + type defines the kinds of events that can be stored in an event queue. + The +\family typewriter +ptl_event_t +\family default + type defines a structure that holds the information associated with an + event. 
+\layout Standard + +The Portals API also provides four functions for dealing with event queues: + The +\emph on +PtlEQAlloc +\emph default + function is used to allocate the API resources needed for an event queue, + the +\emph on +PtlEQFree +\emph default + function is used to release these resources, the +\emph on +PtlEQGet +\emph default + function can be used to get the next event from an event queue, and the + +\emph on +PtlEQWait +\emph default + function can be used to block a process (or thread) until an event queue + has at least one event. +\layout Subsection + +Kinds of Events +\begin_inset LatexCommand \label{sec:ek-type} + +\end_inset + + +\layout LyX-Code + +typedef enum { +\newline + PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL, +\newline + PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL, +\newline + PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL, +\newline + PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL, +\newline + PTL_EVENT_ACK, +\newline + PTL_EVENT_UNLINK +\newline +} ptl_event_kind_t; +\layout Standard +\noindent +The Portals API defines fourteen types of events that can be logged in an + event queue: +\layout Description + +PTL_EVENT_GET_START A remote +\emph on +get +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_GET_END A previously initiated +\emph on +get +\emph default + operation completed successfully. + This event is logged after the reply has been sent by the local node. + As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_GET_FAIL A previously initiated +\emph on +get +\emph default + operation completed unsuccessfully. + This event is logged after the reply has been sent by the local node. 
+ As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_PUT_START A remote +\emph on +put +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should be considered + volatile until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_PUT_END A previously initiated +\emph on +put +\emph default + operation completed successfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_PUT_FAIL A previously initiated +\emph on +put +\emph default + operation completed unsuccessfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_REPLY_START A +\emph on +reply +\emph default + operation has been started on the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_END A previously initiated +\emph on +reply +\emph default + operation has completed successfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_FAIL A previously initiated +\emph on +reply +\emph default + operation has completed unsuccessfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_ACK An +\emph on +acknowledgement +\emph default + was received. + This event is logged when the acknowledgement is received. +\layout Description + +PTL_EVENT_SEND_START An outgoing +\emph on +send +\emph default + operation has been started. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. 
+\layout Description + +PTL_EVENT_SEND_END A previously initiated +\emph on +send +\emph default + operation has completed successfully. + This event is logged after the entire buffer has been sent and it is safe + for the application to reuse the buffer. + +\layout Description + +PTL_EVENT_SEND_FAIL A previously initiated +\emph on +send +\emph default + operation has completed unsuccessfully. + The process can safely manipulate the memory or free the memory descriptor + once it sees this event. +\layout Description + +PTL_EVENT_UNLINK A memory descriptor associated with this event queue has + been automatically unlinked. + This event is not generated when a memory descriptor is explicitly unlinked + by calling +\shape italic +PtlMDUnlink +\shape default +. + This event does not decrement the threshold count. +\layout Subsection + +Event Ordering +\layout Standard + +The Portals API guarantees that when a process initiates two operations + on a remote process, the operations will be initiated on the remote process + in the same order that they were initiated on the original process. + As an example, if process A initiates two +\emph on +put +\emph default + operations, +\emph on +x +\emph default + and +\emph on +y +\emph default +, on process B, the Portals API guarantees that process A will receive the + +\family typewriter +PTL_EVENT_SEND_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default + in the same order that process B receives the +\family typewriter +PTL_EVENT_PUT_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default +. + Notice that the API does not guarantee that the start events will be delivered + in the same order that process A initiated the +\emph on +x +\emph default + and +\emph on +y +\emph default + operations. 
+ If process A needs to ensure the ordering of these operations, it should + include code to wait for the initiation of +\emph on +x +\emph default + before it initiates +\emph on +y +\emph default +. +\layout Subsection + +Failure Notification +\layout Standard + +Operations may fail to complete successfully; however, unless the node itself + fails, every operation that is started will eventually complete. + While an operation is in progress, the memory associated with the operation + should not be viewed (in the case of a put or a reply) or altered (in the + case of a send or get). + Operation completion, whether successful or unsuccessful, is final. + That is, when an operation completes, the memory associated with the operation + will no longer be read or altered by the operation. + A network interface can use the +\family typewriter +ptl_ni_fail_t +\family default + to define more specific information regarding the failure of the operation + and record this information in the +\family typewriter +ni_fail_type +\family default + field of the event. +\layout Subsection + +The Event Type +\begin_inset LatexCommand \label{sec:event-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_event_kind_t type; +\newline + ptl_process_id_t initiator; +\newline + ptl_uid_t uid; +\layout LyX-Code + + ptl_pt_index_t portal; +\newline + ptl_match_bits_t match_bits; +\newline + ptl_size_t rlength; +\newline + ptl_size_t mlength; +\newline + ptl_size_t offset; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_hdr_data_t hdr_data; +\newline + ptl_seq_t link; +\newline + ptl_ni_fail_t ni_fail_type; +\newline + volatile ptl_seq_t sequence; +\newline +} ptl_event_t; +\layout Standard +\noindent +An event structure includes the following members: +\layout Description + +type Indicates the type of the event. + +\layout Description + +initiator The id of the initiator. 
+ +\layout Description + +portal The Portal table index specified in the request. + +\layout Description + +match_bits A copy of the match bits specified in the request. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for more information on match bits. + +\layout Description + +rlength The length (in bytes) specified in the request. + +\layout Description + +mlength The length (in bytes) of the data that was manipulated by the operation. + For truncated operations, the manipulated length will be the number of + bytes specified by the memory descriptor (possibly with an offset) operation. + For all other operations, the manipulated length will be the length of + the requested operation. + +\layout Description + +offset Is the displacement (in bytes) into the memory region that the operation + used. + The offset can be determined by the operation (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + +) for a remote managed memory descriptor, or by the local memory descriptor + (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +). + +\layout Description + +md_handle Is the handle to the memory descriptor associated with the event. +\layout Description + +mem_desc Is the state of the memory descriptor immediately after the event + has been processed. + +\layout Description + +hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +). + +\layout Description + +link The +\emph on +link +\emph default + member is used to link +\family typewriter +START +\family default + events with the +\family typewriter +END +\family default + or +\family typewriter +FAIL +\family default + event that signifies completion of the operation. + The +\emph on +link +\emph default + member will be the same for the two events associated with an operation. 
+ The link member is also used to link an +\family typewriter +UNLINK +\family default + event with the event that caused the memory descriptor to be unlinked. +\layout Description + +sequence The sequence number for this event. + Sequence numbers are unique to each event. +\layout Comment + +The +\emph on +sequence +\emph default + member is the last member and is volatile to support SMP implementations. + When an event structure is filled in, the +\emph on +sequence +\emph default + member should be written after all other members have been updated. + Moreover, a memory barrier should be inserted between the updating of other + members and the updating of the +\emph on +sequence +\emph default + member. +\layout Subsection + +PtlEQAlloc +\begin_inset LatexCommand \label{sec:eqalloc} + +\end_inset + + +\layout LyX-Code + +int PtlEQAlloc( ptl_handle_ni_t interface, +\newline + ptl_size_t count, +\newline + ptl_handle_eq_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlEQAlloc +\emph default + function is used to build an event queue. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + event queue. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface with which the event queue will be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +count +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The number of events that can be stored in the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQFree +\begin_inset LatexCommand \label{sec:eqfree} + +\end_inset + + +\layout LyX-Code + +int PtlEQFree( ptl_handle_eq_t eventq ); +\layout Standard +\noindent +The +\emph on +PtlEQFree +\emph default + function releases the resources associated with an event queue. + It is up to the user to ensure that no memory descriptors are associated + with the event queue once it is freed. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the event queue to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQGet +\begin_inset LatexCommand \label{sec:eqget} + +\end_inset + + +\layout LyX-Code + +int PtlEQGet( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQGet +\emph default + function is a nonblocking function that can be used to get the next event + in an event queue. + The event is removed from the queue. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_EQ_EMPTY Indicates that +\family typewriter +eventq +\family default + is empty or another thread is waiting on +\emph on +PtlEQWait +\emph default +. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQWait +\begin_inset LatexCommand \label{sec:eqwait} + +\end_inset + + +\layout LyX-Code + +int PtlEQWait( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQWait +\emph default + function can be used to block the calling process (thread) until there + is an event in an event queue. + This function also returns the next event in the event queue and removes + this event from the queue. + This is the only blocking operation in the Portals 3.2 API. + In the event that multiple threads are waiting on the same event queue, + PtlEQWait is guaranteed to wake exactly one thread, but the order in which + they are awakened is not specified. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. 
+ +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard +\noindent + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue to wait on. + The calling process (thread) will be blocked until +\family typewriter +eventq +\family default + is not empty. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Section + +The Access Control Table +\begin_inset LatexCommand \label{sec:ac} + +\end_inset + + +\layout Standard + +Processes can use the access control table to control which processes are + allowed to perform operations on Portal table entries. + Each communication interface has a Portal table and an access control table. + The access control table for the default interface contains an entry at + index zero that allows all processes with the same user id to communicate. + Entries in the access control table can be manipulated using the +\emph on +PtlACEntry +\emph default + function. 
+\layout Subsection + +PtlACEntry +\begin_inset LatexCommand \label{sec:acentry} + +\end_inset + + +\layout LyX-Code + +int PtlACEntry( ptl_handle_ni_t interface, +\newline + ptl_ac_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_uid_t user_id, +\newline + ptl_pt_index_t portal ); +\layout Standard +\noindent +The +\emph on +PtlACEntry +\emph default + function can be used to update an entry in the access control table for + an interface. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_AC_INV_INDEX Indicates that +\family typewriter +index +\family default + is not a valid access control table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_PT_INV_INDEX Indicates that +\family typewriter +portal +\family default + is not a valid Portal table index. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the interface to use. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index of the entry in the access control table to update. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the process(es) that are allowed to perform operations. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +user_id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the user that is allowed to perform operations. + The value +\family typewriter +PTL_UID_ANY +\family default + can be used to wildcard the user. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the Portal index(es) that can be used. + The value +\family typewriter +PTL_PT_INDEX_ANY +\family default + can be used to wildcard the Portal index. +\end_inset + + + + +\end_inset + + +\layout Section + +Data Movement Operations +\begin_inset LatexCommand \label{sec:datamovement} + +\end_inset + + +\layout Standard + +The Portals API provides two data movement operations: +\emph on +PtlPut +\emph default + and +\emph on +PtlGet +\emph default +. 
+\layout Subsection + +PtlPut +\begin_inset LatexCommand \label{sec:put} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t; +\newline + +\newline +int PtlPut( ptl_handle_md_t mem_desc, +\newline + ptl_ack_req_t ack_req, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset, +\newline + ptl_hdr_data_t hdr_data ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ack_req_t +\family default + are used to control whether an acknowledgement should be sent when the + operation completes (i.e., when the data has been written to a memory descriptor + of the +\family typewriter +target +\family default + process). + The value +\family typewriter +PTL_ACK_REQ +\family default + requests an acknowledgement, the value +\family typewriter +PTL_NOACK_REQ +\family default + requests that no acknowledgement should be generated. +\layout Standard + +The +\emph on +PtlPut +\emph default + function initiates an asynchronous put operation. + There are several events associated with a put operation: initiation of + the send on the local node ( +\family typewriter +PTL_EVENT_SEND_START +\family default +), completion of the send on the local node ( +\family typewriter +PTL_EVENT_SEND_END +\family default + or +\family typewriter +PTL_EVENT_SEND_FAIL +\family default +), and, when the send completes successfully, the receipt of an acknowledgement + ( +\family typewriter +PTL_EVENT_ACK +\family default +) indicating that the operation was accepted by the target. + These events will be logged in the event queue associated with the memory + descriptor ( +\family typewriter +mem_desc +\family default +) used in the put operation. + Using a memory descriptor that does not have an associated event queue + results in these events being discarded. 
+ In this case, the application must have another mechanism (e.g., a higher + level protocol) for determining when it is safe to modify the memory region + associated with the memory descriptor. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory to be sent. + If the memory descriptor has an event queue associated with it, it will + be used to record events when the message has been sent (PTL_EVENT_SEND_START, + PTL_EVENT_SEND_END). + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ack_req +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Controls whether an acknowledgement event is requested. + Acknowledgements are only sent when they are requested by the initiating + process +\series bold +and +\series default + the memory descriptor has an event queue +\series bold +and +\series default + the target memory descriptor enables them. + Allowed constants: +\family typewriter +PTL_ACK_REQ +\family default +, +\family typewriter +PTL_NOACK_REQ +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +hdr_data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +64 bits of user data that can be included in message header. 
+ This data is written to an event queue entry at the target if an event + queue is present on the matching memory descriptor. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlGet +\begin_inset LatexCommand \label{sec:get} + +\end_inset + + +\layout LyX-Code + +int PtlGet( ptl_handle_md_t mem_desc, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset ); +\layout Standard +\noindent +The +\emph on +PtlGet +\emph default + function initiates a remote read operation. + There are two event pairs associated with a get operation: when the data + is sent from the remote node, a +\family typewriter +PTL_EVENT_GET_{START|END} +\family default + event pair is registered on the remote node; and when the data is returned + from the remote node, a +\family typewriter +PTL_EVENT_REPLY_{START|END} +\family default + event pair is registered on the local node. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory into which + the requested data will be received. 
+ The memory descriptor can have an event queue associated with it to record + events, such as when the message receive has started ( +\family typewriter +PTL_EVENT_REPLY +\family default +_ +\family typewriter +START +\family default +). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). 
+\end_inset + + + + +\end_inset + + +\layout Section + +Summary +\layout Standard + + +\begin_inset LatexCommand \label{sec:summary} + +\end_inset + + We conclude this section by summarizing the names introduced by the Portals + 3.2 API. + We start by summarizing the names of the types introduced by the API. + This is followed by a summary of the functions introduced by the API. + This, in turn, is followed by a summary of the function return codes. + Finally, we conclude with a summary of the other constant values introduced + by the API. +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + + presents a summary of the types defined by the Portals API. + The first column in this table gives the type name, the second column gives + a brief description of the type, the third column identifies the section + where the type is defined, and the fourth column lists the functions that + have arguments of this type. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Types Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:types} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\noindent + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Sect +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Functions +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + 
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlACEntry, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +acknowledgement request types +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlPut +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +kinds of events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +information about events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_seq_t +\end_inset + + +\begin_inset Text + +\layout Standard + +event sequence number +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_any_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for any object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + 
+\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for event queues +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert, + PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_me_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for match entries +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_ni_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut, + PtlGet +\end_inset + + + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter +ptl_nid_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +node identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +process identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +user identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetUid, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +insertion position (before or after) +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +identifiers for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + 
+\family typewriter +ptl_match_bits_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +match (and ignore) bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mb-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ni_fail_t +\end_inset + + +\begin_inset Text + +\layout Standard + +network interface-specific failures +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +process identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for Portal tables +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + +\end_inset + + +\end_inset + 
+ +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +sizes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:size-t} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_value_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +values in status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +unlink options +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + + presents a summary of the functions defined by the Portals API. 
+ The first column in this table gives the name for the function, the second + column gives a brief description of the operation implemented by the function, + and the third column identifies the section where the function is defined. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Functions Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:func} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Operation +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlACEntry +\end_inset + + +\begin_inset Text + +\layout Standard + + update an entry in an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQAlloc +\end_inset + + +\begin_inset Text + +\layout Standard + + create an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQGet +\end_inset + + +\begin_inset Text + +\layout Standard + + get the next event from an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQFree +\end_inset + + +\begin_inset Text + +\layout Standard + + release the resources for an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + PtlEQWait +\end_inset + + +\begin_inset Text + +\layout Standard + + wait for a new event in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGet +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a get operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGetId +\end_inset + + +\begin_inset Text + +\layout Standard + + get the id for the current process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDAttach +\end_inset + + +\begin_inset Text + +\layout Standard + + create a memory descriptor and attach it to a match entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDBind +\end_inset + + +\begin_inset Text + +\layout Standard + + create a free-floating memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + + PtlMDUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a memory descriptor from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDUpdate +\end_inset + + +\begin_inset Text + +\layout Standard + + update a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEAttach +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a Portal table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a free Portal table entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:attachany} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEInsert +\end_inset + + +\begin_inset Text + +\layout Standard + + create a match entry and insert it in a list +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a match entry from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIDist +\end_inset + + +\begin_inset Text + +\layout Standard + + get the distance to another process +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIHandle +\end_inset + + +\begin_inset Text + +\layout Standard + + get the network interface handle for an object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIStatus +\end_inset + + +\begin_inset Text + +\layout Standard + + read a network interface status register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlPut +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a put operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + + summarizes the return codes used by functions defined by the Portals API. + All of these constants are integer values. 
+ The first column of this table gives the symbolic name for the constant, + the second column gives a brief description of the value, and the third + column identifies the functions that can return this value. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Function Return Codes for the Portals 3.2 API +\begin_inset LatexCommand \label{tab:retcodes} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Functions +\series default + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_AC_INV_INDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_DROPPED +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +at least one event has been dropped +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_EMPTY +\end_inset + + +\begin_inset Text + +\layout Standard + +no events available in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +error during initialization or cleanup +\end_inset + + +\begin_inset Text + +\layout Standard
+\noindent + PtlInit, PtlFini +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ILL_MD +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +illegal memory descriptor values +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDBind, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_DUP +\end_inset + + +\begin_inset Text + +\layout Standard + +duplicate initialization of an interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_INV +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +initialization of an invalid interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +the ME already has an MD +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ASIZE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table size +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_EQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUpdate, PtlEQFree, PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_HANDLE +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_MD +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid memory descriptor handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUnlink, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ME +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid match entry handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_NI +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid network interface handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PROC +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid process identifier +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PTINDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid Portal table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_REG +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_SR_INDX +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ML_TOOLONG +\end_inset + + +\begin_inset Text + +\layout Standard + +match list too long +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +MD has pending operations +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMDUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOINIT +\end_inset + + +\begin_inset Text + +\layout Standard + +uninitialized API +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\emph default +, except PtlInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOSPACE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +insufficient memory +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOUPDATE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + no update was performed +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_FULL +\end_inset + + +\begin_inset Text + +\layout Standard + +Portal table is full +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_OK +\end_inset + + +\begin_inset Text + +\layout Standard + + success +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_SEGV +\end_inset + + +\begin_inset Text + +\layout Standard + +addressing violation +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate, + PtlEQAlloc, PtlEQGet, PtlEQWait +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + summarizes the remaining constant values introduced by the Portals API. + The first column in this table presents the symbolic name for the constant, + the second column gives a brief description of the value, the third column + identifies the type for the value, the fourth column identifies the + section in which the value is introduced, and the fifth column identifies + the sections in which the value is referenced. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Other Constants Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:oconsts} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Base type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Intr. +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Ref.
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ACK_REQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +request an acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_NONE +\end_inset + + +\begin_inset Text + +\layout Standard + +a NULL event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_START +\end_inset + + +\begin_inset Text + +\layout Standard + +get event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_END +\end_inset + + +\begin_inset Text + +\layout Standard + +get event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + 
+\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +get event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_START +\end_inset + + +\begin_inset Text + +\layout Standard + +put event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_END +\end_inset + + +\begin_inset Text + +\layout Standard + +put event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +put event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + 
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_START +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_END +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_START +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_END +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_START +\end_inset + + +\begin_inset Text + +\layout Standard + +send event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_END +\end_inset + + +\begin_inset Text + +\layout Standard + +send event end +\end_inset + + +\begin_inset Text 
+ +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +send event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +unlink event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for process id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_NID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for node id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_nid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for user id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_IFACE_DEFAULT +\end_inset + + +\begin_inset Text + +\layout Standard + +default interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_AFTER +\end_inset + + +\begin_inset Text + +\layout Standard + +insert after +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset 
+ + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_BEFORE +\end_inset + + +\begin_inset Text + +\layout Standard + +insert before +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_ACK_DISABLE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to disable acknowledgements +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_MANAGE_REMOTE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable the use of remote offsets +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_GET +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable get operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_PUT +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable put operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_THRESH_INF +\end_inset + + +\begin_inset Text + +\layout Standard + +infinite threshold for a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_TRUNCATE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable truncation of a request +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOACK_REQ +\end_inset + + +\begin_inset Text + +\layout Standard + +request no acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_INDEX_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard 
for Portal indexes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_RETAIN +\end_inset + + +\begin_inset Text + +\layout Standard + +disable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_SR_DROP_COUNT +\end_inset + + +\begin_inset Text + +\layout Standard + +index for the dropped count register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +enable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Chapter + +The Semantics of Message Transmission +\begin_inset LatexCommand \label{sec:semantics} + +\end_inset + + +\layout Standard + +The portals API uses four types of messages: put requests, 
acknowledgements, + get requests, and replies. + In this section, we describe the information passed on the wire for each + type of message. + We also describe how this information is used to process incoming messages. +\layout Section + +Sending Messages +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:put-wire} + +\end_inset + + summarizes the information that is transmitted for a put request. + The first column provides a descriptive name for the information, the second + column provides the type for this information, the third column identifies + the source of the information, and the fourth column provides additional + notes. + Most information that is transmitted is obtained directly from the +\emph on +PtlPut +\emph default + operation. + Notice that the handle for the memory descriptor used in the +\emph on +PtlPut +\emph default + operation is transmitted even though this value cannot be interpreted by + the target. + A value of anything other than +\family typewriter +PTL_MD_NONE +\family default +, is interpreted as a request for an acknowledgement. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Put Request +\begin_inset LatexCommand \label{tab:put-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlPut +\emph default + arg +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a put request +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +no ack if +\family typewriter +PTL_MD_NONE +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length 
+\family default + member +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family roman +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +start +\family default + and +\family typewriter +length +\family default + members +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:ack-wire} + +\end_inset + + summarizes the information transmitted in an acknowledgement. + Most of the information is simply echoed from the put request. + Notice that the initiator and target are obtained directly from the put + request, but are swapped in generating the acknowledgement. + The only new piece of information in the acknowledgement is the manipulated + length which is determined as the put request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in an Acknowledgement +\begin_inset LatexCommand \label{tab:ack-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Put Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + 
+\begin_inset Text + +\layout Standard + + indicates an acknowledgement +\end_inset + + + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + requested length +\end_inset + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + length +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + + obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:get-wire} + +\end_inset + + summarizes the information that is transmitted for a get request. + Like the information transmitted in a put request, most of the information + transmitted in a get request is obtained directly from the +\emph on +PtlGet +\emph default + operation. + Unlike put requests, get requests do not include the event queue handle. + In this case, the reply is generated whenever the operation succeeds and + the memory descriptor must not be unlinked until the reply is received. + As such, there is no advantage to explicitly sending the event queue handle. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Get Request +\begin_inset LatexCommand \label{tab:get-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlGet +\emph default + argument +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a get operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length +\family default + member +\end_inset + + + + +\end_inset + 
+ +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:reply-wire} + +\end_inset + + summarizes the information transmitted in a reply. + Like an acknowledgement, most of the information is simply echoed from + the get request. + The initiator and target are obtained directly from the get request, but + are swapped in generating the reply. + The only new information in the reply is the manipulated length + and the data, which are determined as the get request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Reply +\begin_inset LatexCommand \label{tab:reply-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Get Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a reply +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +requested length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout 
Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Section + +Receiving Messages +\begin_inset LatexCommand \label{sec:receiving} + +\end_inset + + +\layout Standard + +When an incoming message arrives on a network interface, the communication + system first checks that the target process identified in the request is + a valid process that has initialized the network interface (i.e., that the + target process has a valid Portal table). + If this test fails, the communication system discards the message and increment +s the dropped message count for the interface. + The remainder of the processing depends on the type of the incoming message. + Put and get messages are subject to access control checks and translation + (searching a match list), while acknowledgement and reply messages bypass + the access control checks and the translation step. +\layout Standard + +Acknowledgement messages include a handle for the memory descriptor used + in the original +\emph on +PtlPut +\emph default + operation. + This memory descriptor will identify the event queue where the event should + be recorded. + Upon receipt of an acknowledgement, the runtime system only needs to confirm + that the memory descriptor and event queue still exist and that there is + space for another event. + Should any of these conditions fail, the message is simply discarded + and the dropped message count for the interface is incremented. + Otherwise, the system builds an acknowledgement event from the information + in the acknowledgement message and adds it to the event queue. 
+\layout Standard + +Reception of reply messages is also relatively straightforward. + Each reply message includes a handle for a memory descriptor. + If this descriptor exists, it is used to receive the message. + A reply message will be dropped if the memory descriptor identified in + the request doesn't exist. + In this case, the dropped message count for the interface is + incremented. + This is the only reason for dropping reply messages. + Every memory descriptor accepts and truncates incoming reply messages, + eliminating the other potential reasons for rejecting a reply message. +\layout Standard + +The critical step in processing an incoming put or get request involves + mapping the request to a memory descriptor. + This step starts by using the Portal index in the incoming request to identify + a list of match entries. + This list of match entries is searched in order until a match entry is + found whose match criteria match the match bits in the incoming request + and whose memory descriptor accepts the request. +\layout Standard + +Because acknowledgement and reply messages are generated in response to requests + made by the process receiving these messages, the checks performed by the + runtime system for acknowledgements and replies are minimal. + In contrast, put and get messages are generated by remote processes and + the checks performed for these messages are more extensive. 
+ Incoming put or get messages may be rejected because: +\layout Itemize + +the Portal index supplied in the request is not valid; +\layout Itemize + +the cookie supplied in the request is not a valid access control entry; + +\layout Itemize + +the access control entry identified by the cookie does not match the identifier + of the requesting process; +\layout Itemize + +the access control entry identified by the cookie does not + match the Portal index supplied in the request; or +\layout Itemize + +the match bits supplied in the request do not match any of the match entries + with a memory descriptor that accepts the request. + +\layout Standard + +In all cases, if the message is rejected, the incoming message is discarded + and the dropped message count for the interface is incremented. +\layout Standard + +A memory descriptor may reject an incoming request for any of the following + reasons: +\layout Itemize + +the +\family typewriter +PTL_MD_OP_PUT +\family default + or +\family typewriter +PTL_MD_OP_GET +\family default + option has not been enabled and the operation is put or get, respectively; + +\layout Itemize + +the length specified in the request is too long for the memory descriptor + and the +\family typewriter +PTL_MD_TRUNCATE +\family default + option has not been enabled. +\layout Chapter + +Examples +\begin_inset LatexCommand \label{sec:examples} + +\end_inset + + +\layout Comment + +The examples presented in this chapter have not been updated to reflect + the current API. +\layout Standard + +In this section we present several examples to illustrate expected usage + patterns for the Portals 3.2 API. + The first example describes how to implement parallel servers using the + features of the Portals 3.2 API. + This example covers the access control list and the use of remote managed + offsets. + The second example presents an approach to dealing with dropped requests. + This example covers aspects of match lists and memory descriptors. 
+ The final example covers message reception in MPI. + This example illustrates more sophisticated uses of matching and a procedure + to update a memory descriptor. +\layout Section + +Parallel File Servers +\begin_inset LatexCommand \label{sec:expfs} + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:file} + +\end_inset + + illustrates the logical structure of a parallel file server. + In this case, the parallel server consists of four servers that stripe + application data across four disks. + We would like to present applications with the illusion that the file server + is a single entity. + We will assume that all of the processes that constitute the parallel server + have the same user id. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename file.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 196pt + lyxheight 147pt +\end_inset + + +\layout Caption + +Parallel File Server +\begin_inset LatexCommand \label{fig:file} + +\end_inset + + +\end_inset + + +\layout Standard + +When an application establishes a connection to the parallel file server, + it will allocate a Portal and access control list entry for communicating + with the server. + The access control list entry will include the Portal and match any process + in the parallel file server, so all of the file server processes will + have access to the portal. + The Portal information and access control entry will be sent to the file + server at this time. + If the application and server need to have multiple, concurrent I/O operations, + they can use additional portals or match entries to keep the operations + from interfering with one another. 
+\layout Standard + +When an application initiates an I/O operation, it first builds a memory + descriptor that describes the memory region involved in the operation. + This memory descriptor will enable the appropriate operation (put for read + operations and get for write operations) and enable the use of remote offsets + (this lets the servers decide where their data should be placed in the + memory region). + After creating the memory descriptor and linking it into the appropriate + Portal entry, the application sends a read or write request (using +\emph on +PtlPut +\emph default +) to one of the file server processes. + The file server processes can then use put or get operations with the appropria +te offsets to fill or retrieve the contents of the application's buffer. + To know when the operation has completed, the application can add an event + queue to the memory descriptor and add up the lengths of the remote operations + until the sum is the size of the requested I/O operation. +\layout Section + +Dealing with Dropped Requests +\begin_inset LatexCommand \label{sec:exdrop} + +\end_inset + + +\layout Standard + +If a process does not anticipate unexpected requests, they will be discarded. + Applications using the Portals API can query the dropped count for the + interface to determine the number of requests that have been dropped (see + Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + While this approach minimizes resource consumption, it does not provide + information that might be critical in debugging the implementation of a + higher level protocol. +\layout Standard + +To keep track of more information about dropped requests, we use a memory + descriptor that truncates each incoming request to zero bytes and logs + the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + operations in an event queue. 
+ Note that the operations are not dropped in the Portals sense, because + the operation succeeds. +\layout Standard + +The following code fragment illustrates an implementation of this approach. + In this case, we assume that a thread is launched to execute the function + +\family typewriter +watch_drop +\family default +. + This code starts by building an event queue to log truncated operations + and a memory descriptor to truncate the incoming requests. + This example only captures +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests for a single portal. + In a more realistic situation, the memory descriptor would be appended + to the match list for every portal. + We also assume that the thread is capable of keeping up with the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests. + If this is not the case, we could use a finite threshold on the memory + descriptor to capture the first few dropped requests. 
+\layout LyX-Code + + +\size small +#include +\newline +#include +\newline +#include +\newline + +\newline +#define DROP_SIZE 32 /* number of dropped requests to track */ +\newline + +\newline +int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) { +\newline + ptl_handle_eq_t drop_events; +\newline + ptl_event_t event; +\newline + ptl_handle_md_t drop_em; +\newline + ptl_md_t drop_desc; +\newline + ptl_process_id_t any_proc; +\newline + ptl_handle_me_t match_any; +\newline + +\newline + /* create the event queue */ +\newline + if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the event queue +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* build a match entry */ +\newline + any_proc.nid = PTL_ID_ANY; +\newline + any_proc.pid = PTL_ID_ANY; +\newline + PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN, +\newline + &match_any ); +\newline + +\newline + /* create the memory descriptor */ +\newline + drop_desc.start = NULL; +\newline + drop_desc.length = 0; +\newline + drop_desc.threshold = PTL_MD_THRESH_INF; +\newline + drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE; +\newline + drop_desc.user_ptr = NULL; +\newline + drop_desc.eventq = drop_events; +\newline + if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the memory descriptor +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* watch for "dropped" requests */ +\newline + while( 1 ) { +\newline + if( PtlEQWait( drop_events, &event ) != PTL_OK ) break; +\newline + fprintf( stderr, "Dropped request from gid = event.initiator.gid, + event.initiator.rid ); +\newline + } +\newline +} +\layout Section + +Message Transmission in MPI +\begin_inset LatexCommand \label{sec:exmpi} + +\end_inset + + +\layout Standard + +We conclude this section with a fairly extensive example that describes + an approach to 
implementing message transmission for MPI. + Like many MPI implementations, we distinguish two message transmission + protocols: a short message protocol and a long message protocol. + We use the constant +\family typewriter +MPI_LONG_LENGTH +\family default + to determine the size of a long message. +\layout Standard + +For small messages, the sender simply sends the message and presumes that + the message will be received (i.e., the receiver has allocated a memory region + to receive the message body). + For large messages, the sender also sends the message, but does not presume + that the message body will be saved. + Instead, the sender builds a memory descriptor for the message and enables + get operations on this descriptor. + If the target does not save the body of the message, it will record an + event for the put operation. + When the process later issues a matching MPI receive, it will perform a + get operation to retrieve the body of the message. +\layout Standard + +To facilitate receive side matching based on the protocol, we use the most + significant bit in the match bits to indicate the protocol: 1 for long + messages and 0 for short messages. +\layout Standard + +The following code presents a function that implements the send side of + the protocol. + The global variable +\family typewriter +EndGet +\family default + is the last match entry attached to the Portal index used for posting long + messages. + This entry does not match any incoming requests (i.e., the memory descriptor + rejects all get operations) and is built during initialization of the MPI + library. + The other global variable, +\family typewriter +MPI_NI +\family default +, is a handle for the network interface used by the MPI implementation. 
+\layout LyX-Code + + +\size small +extern ptl_handle_me_t EndGet; +\newline +extern ptl_handle_ni_t MPI_NI; +\newline + +\newline +void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq, +\newline + ptl_process_id_t target, ptl_match_bits_t match ) +\newline +{ +\newline + ptl_handle_md_t send_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_ack_req_t want_ack; +\newline + +\newline + mem_desc.start = buf; +\newline + mem_desc.length = len; +\newline + mem_desc.threshold = 1; +\newline + mem_desc.options = PTL_MD_OP_GET; +\newline + mem_desc.user_ptr = data; +\newline + mem_desc.eventq = eventq; +\newline + +\newline + if( len >= MPI_LONG_LENGTH ) { +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + /* add a match entry to the end of the get list */ +\newline + PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet, + &me_handle ); +\newline + PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL ); +\newline + +\newline + /* we want an ack for long messages */ +\newline + want_ack = PTL_ACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a long message + */ +\newline + match |= (ptl_match_bits_t)1<<63; +\newline + } else { +\newline + /* we don't want an ack for short messages */ +\newline + want_ack = PTL_NOACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a short message + */ +\newline + match &= ~((ptl_match_bits_t)1<<63); +\newline + } +\newline + +\newline + /* create a memory descriptor and send it */ +\newline + PtlMDBind( MPI_NI, mem_desc, &send_handle ); +\newline + PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match, + 0 ); +\newline +} +\layout Standard + +The +\emph on +MPIsend +\emph default + function returns as soon as the message has been scheduled for transmission. + The event queue argument, +\family typewriter +eventq +\family default +, can be used to determine the disposition of the message. 
+ Assuming that +\family typewriter +eventq +\family default + is not +\family typewriter +PTL_EQ_NONE +\family default +, a +\family typewriter +PTL_EVENT_SENT +\family default + event will be recorded for each message as the message is transmitted. + For small messages, this is the only event that will be recorded in +\family typewriter +eventq +\family default +. + In contrast, long messages include an explicit request for an acknowledgement. + If the +\family typewriter +target +\family default + process has posted a matching receive, the acknowledgement will be sent + as the message is received. + If a matching receive has not been posted, the message will be discarded + and no acknowledgement will be sent. + When the +\family typewriter +target +\family default + process later issues a matching receive, the receive will be translated + into a get operation and a +\family typewriter +PTL_EVENT_GET +\family default + event will be recorded in +\family typewriter +eventq +\family default +. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:mpi} + +\end_inset + + illustrates the organization of the match list used for receiving MPI messages. + The initial entries (not shown in this figure) would be used to match the + MPI receives that have been preposted by the application. + The preposted receives are followed by a match entry, +\emph on +RcvMark +\emph default +, that marks the boundary between preposted receives and the memory descriptors + used for +\begin_inset Quotes eld +\end_inset + +unexpected +\begin_inset Quotes erd +\end_inset + + messages. + The +\emph on +RcvMark +\emph default + entry is followed by a small collection of match entries that match unexpected + +\begin_inset Quotes eld +\end_inset + +short +\begin_inset Quotes erd +\end_inset + + messages, i.e., messages that have a 0 in the most significant bit of their + match bits. 
+ The memory descriptors associated with these match entries will append + the incoming message to the associated memory descriptor and record an + event in an event queue for unexpected messages. + The unexpected short message matching entries are followed by a match entry + that will match messages that were not matched by the preceding match entries, + i.e., the unexpected long messages. + The memory descriptor associated with this match entry truncates the message + body and records an event in the event queue for unexpected messages. + Note that all of the memory descriptors used for unexpected messages share + a common event queue. + This makes it possible to process the unexpected messages in the order + in which they arrived, regardless of whether they are short or long. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename mpi.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 389pt + lyxheight 284pt +\end_inset + + +\layout Caption + +Message Reception in MPI +\begin_inset LatexCommand \label{fig:mpi} + +\end_inset + + +\end_inset + + +\layout Standard + +When the local MPI process posts an MPI receive, we must first search the + events in the unexpected message queue to see if a matching message has already + arrived. + If no matching message is found, a match entry for the receive is inserted + before the +\emph on +RcvMark +\emph default + entry--after the match entries for all of the previously posted receives + and before the match entries for the unexpected messages. + This ensures that preposted receives are matched in the order that they + were posted (a requirement of MPI). 
+ +\layout Standard + +While this strategy respects the temporal semantics of MPI, it introduces + a race condition: a matching message might arrive after the events in the + unexpected message queue have been searched, but before the match entry + for the receive has been inserted in the match list. + +\layout Standard + +To avoid this race condition we start by setting the +\family typewriter +threshold +\family default + of the memory descriptor to 0, making the descriptor inactive. + We then insert the match entry into the match list and proceed to search + the events in the unexpected message queue. + A matching message that arrives as we are searching the unexpected message + queue will not be accepted by the memory descriptor and, if not matched + by an earlier match list element, will add an event to the unexpected message + queue. + After searching the events in the unexpected message queue, we update the + memory descriptor, setting the threshold to 1 to activate the memory descriptor. + This update is predicated by the condition that the unexpected message + queue is empty. + We repeat the process of searching the unexpected message queue until the + update succeeds. +\layout Standard + +The following code fragment illustrates this approach. + Because events must be removed from the unexpected message queue to be + examined, this code fragment assumes the existence of a user managed event + list, +\family typewriter +Rcvd +\family default +, for the events that have already been removed from the unexpected message + queue. + In an effort to keep the example focused on the basic protocol, we have + omitted the code that would be needed to manage the memory descriptors + used for unexpected short messages. + In particular, we simply leave messages in these descriptors until they + are received by the application. 
+ In a robust implementation, we would introduce code to ensure that short + unexpected messages are removed from these memory descriptors so that they + can be re-used. +\layout LyX-Code + + +\size small +extern ptl_handle_eq_t UnexpQueue; +\newline +extern ptl_handle_me_t RcvMark; +\newline +extern ptl_handle_me_t ShortMatch; +\newline + +\newline +typedef struct event_list_tag { +\newline + ptl_event_t event; +\newline + struct event_list_tag* next; +\newline +} event_list; +\newline + +\newline +extern event_list Rcvd; +\newline + +\newline +void AppendRcvd( ptl_event_t event ) +\newline +{ +\newline + /* append an event onto the Rcvd list */ +\newline +} +\newline + +\newline +int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi +ts_t match, +\newline + ptl_match_bits_t ignore, ptl_event_t *event ) +\newline +{ +\newline + /* Search the Rcvd event queue, looking for a message that matches the + requested message. +\newline + * If one is found, remove the event from the Rcvd list and return it. 
+ */ +\newline +} +\newline + +\newline +typedef enum { RECEIVED, POSTED } receive_state; +\newline + +\newline +receive_state CopyMsg( void *buf, ptl_size_t *length, ptl_event_t event, + ptl_md_t md_buf ) +\newline +{ +\newline + ptl_handle_md_t md_handle; +\newline + +\newline + if( event.rlength >= MPI_LONG_LENGTH ) { +\newline + PtlMDBind( MPI_NI, md_buf, &md_handle ); +\newline + PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX, + md_handle ); +\newline + return POSTED; +\newline + } else { +\newline + /* copy the message */ +\newline + if( event.mlength < *length ) *length = event.mlength; +\newline + memcpy( buf, (char*)event.md_desc.start+event.offset, *length ); +\newline + return RECEIVED; +\newline + } +\newline +} +\newline + +\newline +receive_state MPIreceive( void *buf, ptl_size_t *len, void *MPI_data, ptl_handle +_eq_t eventq, +\newline + ptl_process_id_t sender, ptl_match_bits_t match, + ptl_match_bits_t ignore ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_handle_me_t me_handle; +\newline + ptl_event_t event; +\newline + +\newline + /* build a memory descriptor for the receive */ +\newline + md_buf.start = buf; +\newline + md_buf.length = *len; +\newline + md_buf.threshold = 0; /* temporarily disabled */ +\newline + md_buf.options = PTL_MD_OP_PUT; +\newline + md_buf.user_ptr = MPI_data; +\newline + md_buf.eventq = eventq; +\newline + +\newline + /* see if we have already received the message */ +\newline + if( SearchRcvd(buf, *len, sender, match, ignore, &event) ) +\newline + return CopyMsg( buf, len, event, md_buf ); +\newline + +\newline + /* create the match entry and attach the memory descriptor */ +\newline + PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark, + &me_handle); +\newline + PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle ); +\newline + +\newline + md_buf.threshold = 1; +\newline + do +\newline + if( 
PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) { +\newline + if( MPIMatch(event, match, ignore, sender) ) { +\newline + return CopyMsg( buf, len, event, md_buf ); +\newline + } else { +\newline + AppendRcvd( event ); +\newline + } +\newline + } +\newline + while( PtlMDUpdate(md_handle, NULL, &md_buf, UnexpQueue) == PTL_NOUPDATE + ); +\newline + return POSTED; +\newline +} +\layout Chapter* + +Acknowledgments +\layout Standard + +Several people have contributed to the philosophy, design, and implementation + of the Portals message passing architecture as it has evolved. + We acknowledge the following people for their contributions: Al Audette, + Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike + Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke, + Dave van Dresser, Lee Ward, and Stephen Wheat. + +\layout Standard + + +\begin_inset LatexCommand \BibTeX[ieee]{portals3} + +\end_inset + + +\the_end diff --git a/lnet/doc/put.fig b/lnet/doc/put.fig new file mode 100644 index 0000000..5235b6d --- /dev/null +++ b/lnet/doc/put.fig @@ -0,0 +1,32 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1350 900 2175 1200 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1275 2700 1725 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 1200 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2699 1788 899 1938 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001 +4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001 +4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001 +4 0 0 100 0 0 10 0.0000 0 
105 405 2850 1500 Portal\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am new file mode 100644 index 0000000..2cf7f99 --- /dev/null +++ b/lnet/include/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = portals linux +EXTRA_DIST = config.h.in +include $(top_srcdir)/Rules diff --git a/lnet/include/config.h.in b/lnet/include/config.h.in new file mode 100644 index 0000000..b05d0c4 --- /dev/null +++ b/lnet/include/config.h.in @@ -0,0 +1,11 @@ +/* ../include/config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if you have the readline library (-lreadline). */ +#undef HAVE_LIBREADLINE + +/* Name of package */ +#undef PACKAGE + +/* Version number of package */ +#undef VERSION + diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am new file mode 100644 index 0000000..6a65cb5 --- /dev/null +++ b/lnet/include/linux/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include $(top_srcdir)/Rules + +linuxincludedir = $(includedir)/linux + +linuxinclude_HEADERS=kp30.h portals_lib.h diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h new file mode 100644 index 0000000..4915fe3 --- /dev/null +++ b/lnet/include/linux/kp30.h @@ -0,0 +1,936 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _KP30_INCLUDED +#define _KP30_INCLUDED + + +#define PORTAL_DEBUG + +#ifndef offsetof +# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) +#endif + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +#ifndef CONFIG_SMP +# define smp_processor_id() 0 +#endif + +/* + * Debugging + */ +extern unsigned int portal_subsystem_debug; +extern unsigned int portal_stack; +extern unsigned int portal_debug; +extern unsigned int portal_printk; +/* Debugging subsystems (8 bit ID) + * + * If you add debug subsystem #32, you need to send email to phil, because + * you're going to break kernel subsystem debug filtering. */ +#define S_UNDEFINED (0 << 24) +#define S_MDC (1 << 24) +#define S_MDS (2 << 24) +#define S_OSC (3 << 24) +#define S_OST (4 << 24) +#define S_CLASS (5 << 24) +#define S_OBDFS (6 << 24) /* obsolete */ +#define S_LLITE (7 << 24) +#define S_RPC (8 << 24) +#define S_EXT2OBD (9 << 24) /* obsolete */ +#define S_PORTALS (10 << 24) +#define S_SOCKNAL (11 << 24) +#define S_QSWNAL (12 << 24) +#define S_PINGER (13 << 24) +#define S_FILTER (14 << 24) +#define S_TRACE (15 << 24) /* obsolete */ +#define S_ECHO (16 << 24) +#define S_LDLM (17 << 24) +#define S_LOV (18 << 24) +#define S_GMNAL (19 << 24) +#define S_PTLROUTER (20 << 24) +#define S_COBD (21 << 24) +#define S_PTLBD (22 << 24) +#define S_LOG (23 << 24) + +/* If you change these values, please keep portals/linux/utils/debug.c + * up to date! 
*/ + +/* Debugging masks (24 bits, non-overlapping) */ +#define D_TRACE (1 << 0) /* ENTRY/EXIT markers */ +#define D_INODE (1 << 1) +#define D_SUPER (1 << 2) +#define D_EXT2 (1 << 3) /* anything from ext2_debug */ +#define D_MALLOC (1 << 4) /* print malloc, free information */ +#define D_CACHE (1 << 5) /* cache-related items */ +#define D_INFO (1 << 6) /* general information */ +#define D_IOCTL (1 << 7) /* ioctl related information */ +#define D_BLOCKS (1 << 8) /* ext2 block allocation */ +#define D_NET (1 << 9) /* network communications */ +#define D_WARNING (1 << 10) +#define D_BUFFS (1 << 11) +#define D_OTHER (1 << 12) +#define D_DENTRY (1 << 13) +#define D_PORTALS (1 << 14) /* ENTRY/EXIT markers */ +#define D_PAGE (1 << 15) /* bulk page handling */ +#define D_DLMTRACE (1 << 16) +#define D_ERROR (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ +#define D_HA (1 << 19) /* recovery and failover */ +#define D_RPCTRACE (1 << 20) /* for distributed debugging */ +#define D_VFSTRACE (1 << 21) + +#ifndef THREAD_SIZE +#define THREAD_SIZE 8192 +#endif +#ifdef __arch_ia64__ +#define CDEBUG_STACK(var) (&var & (THREAD_SIZE - 1)) +#else +#define CDEBUG_STACK(var) (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0)& \ + (THREAD_SIZE - 1))) +#endif + +#ifdef __KERNEL__ +#define CHECK_STACK(stack) \ + do { \ + if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) \ + portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR, \ + __FILE__, __FUNCTION__, __LINE__, \ + (stack), \ + "maximum lustre stack %u\n", \ + portal_stack = (stack)); \ + } while (0) +#else +#define CHECK_STACK(stack) do{}while(0) +#endif + +#define CDEBUG(mask, format, a...) 
\ +do { \ + unsigned long stack = CDEBUG_STACK(stack); \ + int match = 0; \ + \ + CHECK_STACK(stack); \ + if (!(mask)) \ + match = 1; \ + else if ((mask) & (D_ERROR | D_EMERG)) \ + match = 1; \ + else if (portal_debug & (mask) && \ + portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24))) \ + match = 1; \ + if (match) \ + portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + stack, format , ## a); \ +} while (0) + +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) + +#define GOTO(label, rc) \ +do { \ + long GOTO__ret = (long)(rc); \ + CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \ + #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\ + (signed long)GOTO__ret); \ + goto label; \ +} while (0) + +#define RETURN(rc) \ +do { \ + typeof(rc) RETURN__ret = (rc); \ + long tmp = (long)RETURN__ret; \ + CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ + (unsigned long)tmp, (signed long)tmp, \ + (signed long)tmp); \ + return RETURN__ret; \ +} while (0) + +#define ENTRY \ +do { \ + CDEBUG(D_TRACE, "Process entered\n"); \ +} while (0) + +#define EXIT \ +do { \ + CDEBUG(D_TRACE, "Process leaving\n"); \ +} while(0) + + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define schedule_work schedule_task +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_TQUEUE((wq), 0, 0); \ + PREPARE_TQUEUE((wq), (cb), (cbdata)); \ +} while (0) + +#define ll_invalidate_inode_pages invalidate_inode_pages +#define PageUptodate Page_Uptodate +#define our_recalc_sigpending(current) recalc_sigpending(current) +#define num_online_cpus() smp_num_cpus +static inline void our_cond_resched(void) +{ + if (current->need_resched) + schedule (); +} + +#else + +#define 
prepare_work(wq,cb,cbdata) \ +do { \ + INIT_WORK((wq), (void *)(cb), (void *)(cbdata)); \ +} while (0) +#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping) +#define wait_on_page wait_on_page_locked +#define our_recalc_sigpending(current) recalc_sigpending() +#define strtok(a,b) strpbrk(a, b) +static inline void our_cond_resched(void) +{ + cond_resched(); +} +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */ + +#ifdef PORTAL_DEBUG +extern void kportal_assertion_failed(char *expr,char *file,char *func,int line); +#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \ + __FUNCTION__, __LINE__)) +#else +#define LASSERT(e) +#endif + +#ifdef __arch_um__ +#define LBUG() \ +do { \ + CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__); \ + panic("LBUG"); \ +} while (0) +#else +#define LBUG() \ +do { \ + CEMERG("LBUG\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__); \ + set_task_state(current, TASK_UNINTERRUPTIBLE); \ + schedule(); \ +} while (0) +#endif /* __arch_um__ */ + +/* + * Memory + */ +#ifdef PORTAL_DEBUG +extern atomic_t portal_kmemory; + +# define portal_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &portal_kmemory); \ +} while (0) + +# define portal_kmem_dec(ptr, size) do { \ + atomic_sub(size, &portal_kmemory); \ +} while (0) + +#else +# define portal_kmem_inc(ptr, size) do {} while (0) +# define portal_kmem_dec(ptr, size) do {} while (0) +#endif /* PORTAL_DEBUG */ + +#define PORTAL_VMALLOC_SIZE 16384 + +#define PORTAL_ALLOC(ptr, size) \ +do { \ + long s = size; \ + LASSERT (!in_interrupt()); \ + if (s > PORTAL_VMALLOC_SIZE) \ + (ptr) = vmalloc(s); \ + else \ + (ptr) = kmalloc(s, GFP_KERNEL); \ + if ((ptr) == NULL) \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s); \ + else { \ + portal_kmem_inc((ptr), 
s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_FREE(ptr, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + if (s > PORTAL_VMALLOC_SIZE) \ + vfree(ptr); \ + else \ + kfree(ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_ALLOC(ptr, slab, size) \ +do { \ + long s = (size); \ + LASSERT (!in_interrupt()); \ + (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' from slab '" #slab "')\n", __FILE__, \ + __LINE__); \ + } else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_FREE(ptr, slab, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + memset((ptr), 0x5a, s); \ + kmem_cache_free((slab), ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +/* ------------------------------------------------------------------- */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + +#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x) +#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x) + +#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x)) +#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) + +#define PORTAL_MODULE_USE MOD_INC_USE_COUNT +#define 
PORTAL_MODULE_UNUSE MOD_DEC_USE_COUNT +#else + +#define PORTAL_SYMBOL_REGISTER(x) +#define PORTAL_SYMBOL_UNREGISTER(x) + +#define PORTAL_SYMBOL_GET(x) symbol_get(x) +#define PORTAL_SYMBOL_PUT(x) symbol_put(x) + +#define PORTAL_MODULE_USE try_module_get(THIS_MODULE) +#define PORTAL_MODULE_UNUSE module_put(THIS_MODULE) + +#endif + +/******************************************************************************/ +/* Kernel Portals Router interface */ + +typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback + +/* space for routing targets to stash "stuff" in a forwarded packet */ +typedef union { + long long _alignment; + void *_space[16]; /* scale with CPU arch */ +} kprfd_scratch_t; + +/* Kernel Portals Routing Forwarded message Descriptor */ +typedef struct { + struct list_head kprfd_list; /* stash in queues (routing target can use) */ + ptl_nid_t kprfd_target_nid; /* final destination NID */ + ptl_nid_t kprfd_gateway_nid; /* gateway NID */ + int kprfd_nob; /* # message bytes (including header) */ + int kprfd_niov; /* # message frags (including header) */ + struct iovec *kprfd_iov; /* message fragments */ + void *kprfd_router_arg; // originating NAL's router arg + kpr_fwd_callback_t kprfd_callback; /* completion callback */ + void *kprfd_callback_arg; /* completion callback arg */ + kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets +} kpr_fwd_desc_t; + +typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); + +/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ +typedef const struct { + int kprni_nalid; /* NAL's id */ + void *kprni_arg; /* Arg to pass when calling into NAL */ + kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ +} kpr_nal_interface_t; + +/* Router's routing interface (Kernel Portals Routing Router Interface) */ +typedef const struct { + /* register the calling NAL with the router and get back the handle for + * subsequent calls */ + int (*kprri_register) (kpr_nal_interface_t 
*nal_interface, + void **router_arg); + + /* ask the router to find a gateway that forwards to 'nid' and is a peer + * of the calling NAL */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + ptl_nid_t *gateway_nid); + + /* hand a packet over to the router for forwarding */ + kpr_fwd_t kprri_fwd_start; + + /* hand a packet back to the router for completion */ + void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, + int error); + + /* the calling NAL is shutting down */ + void (*kprri_shutdown) (void *router_arg); + + /* deregister the calling NAL with the router */ + void (*kprri_deregister) (void *router_arg); + +} kpr_router_interface_t; + +/* Convenient struct for NAL to stash router interface/args */ +typedef struct { + kpr_router_interface_t *kpr_interface; + void *kpr_arg; +} kpr_router_t; + +/* Router's control interface (Kernel Portals Routing Control Interface) */ +typedef const struct { + int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); + int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_get_route)(int index, int *gateway_nal, + ptl_nid_t *gateway, ptl_nid_t *lo_nid, + ptl_nid_t *hi_nid); +} kpr_control_interface_t; + +extern kpr_control_interface_t kpr_control_interface; +extern kpr_router_interface_t kpr_router_interface; + +static inline int +kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) +{ + int rc; + + router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); + if (router->kpr_interface == NULL) + return (-ENOENT); + + rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); + if (rc != 0) + router->kpr_interface = NULL; + + PORTAL_SYMBOL_PUT (kpr_router_interface); + return (rc); +} + +static inline int +kpr_routing (kpr_router_t *router) +{ + return (router->kpr_interface != NULL); +} + +static inline int +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +{ + if (!kpr_routing (router)) + return (-EHOSTUNREACH); + 
+ return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + gateway_nid)); +} + +static inline void +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, + int nob, int niov, struct iovec *iov, + kpr_fwd_callback_t callback, void *callback_arg) +{ + fwd->kprfd_target_nid = nid; + fwd->kprfd_gateway_nid = nid; + fwd->kprfd_nob = nob; + fwd->kprfd_niov = niov; + fwd->kprfd_iov = iov; + fwd->kprfd_callback = callback; + fwd->kprfd_callback_arg = callback_arg; +} + +static inline void +kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) +{ + if (!kpr_routing (router)) + fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + else + router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); +} + +static inline void +kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) +{ + LASSERT (kpr_routing (router)); + router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); +} + +static inline void +kpr_shutdown (kpr_router_t *router) +{ + if (kpr_routing (router)) + router->kpr_interface->kprri_shutdown (router->kpr_arg); +} + +static inline void +kpr_deregister (kpr_router_t *router) +{ + if (!kpr_routing (router)) + return; + router->kpr_interface->kprri_deregister (router->kpr_arg); + router->kpr_interface = NULL; +} + +/******************************************************************************/ + +#ifdef PORTALS_PROFILING +#define prof_enum(FOO) PROF__##FOO +enum { + prof_enum(our_recvmsg), + prof_enum(our_sendmsg), + prof_enum(socknal_recv), + prof_enum(lib_parse), + prof_enum(conn_list_walk), + prof_enum(memcpy), + prof_enum(lib_finalize), + prof_enum(pingcli_time), + prof_enum(gmnal_send), + prof_enum(gmnal_recv), + MAX_PROFS +}; + +struct prof_ent { + char *str; + /* hrmph. wrap-tastic. 
*/ + u32 starts; + u32 finishes; + cycles_t total_cycles; + cycles_t start; + cycles_t end; +}; + +extern struct prof_ent prof_ents[MAX_PROFS]; + +#define PROF_START(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->starts++; \ + pe->start = get_cycles(); \ + } while (0) + +#define PROF_FINISH(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->finishes++; \ + pe->end = get_cycles(); \ + pe->total_cycles += (pe->end - pe->start); \ + } while (0) +#else /* !PORTALS_PROFILING */ +#define PROF_START(FOO) do {} while(0) +#define PROF_FINISH(FOO) do {} while(0) +#endif /* PORTALS_PROFILING */ + +/* debug.c */ +void portals_run_lbug_upcall(char * file, char *fn, int line); +void portals_debug_dumplog(void); +int portals_debug_init(unsigned long bufsize); +int portals_debug_cleanup(void); +int portals_debug_clear_buffer(void); +int portals_debug_mark_buffer(char *text); +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *file, unsigned int size); +__s32 portals_debug_copy_to_user(char *buf, unsigned long len); +#if (__GNUC__) +/* Use the special GNU C __attribute__ hack to have the compiler check the + * printf style argument string against the actual argument count and + * types. + */ +#ifdef printf +# warning printf has been defined as a macro... +# undef printf +#endif +void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) + __attribute__ ((format (printf, 7, 8))); +#else +void portals_debug_msg (int subsys, int mask, char *file, char *fn, + int line, unsigned long stack, + const char *format, ...); +#endif /* __GNUC__ */ +void portals_debug_set_level(unsigned int debug_level); + +# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) +# define printf(format, b...) 
/*
 * User-space stand-ins for the allocation helpers.
 *
 * PORTAL_ALLOC(ptr, size): assign malloc(size) to 'ptr' (NULL on failure;
 *                          the caller must check).
 * PORTAL_FREE(a, b):       free(a); the size argument 'b' is accepted for
 *                          call-site compatibility and is not used.
 *
 * Both expand to a single statement WITHOUT a trailing semicolon, so that
 *
 *         if (cond)
 *                 PORTAL_ALLOC(p, n);
 *         else
 *                 ...
 *
 * parses correctly.  Call sites must supply their own ';'.
 */
# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0)
# define PORTAL_FREE(a, b)       do { free(a); } while (0)
FIXME check conflict with lustre_lib.h */ +#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) + +static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) +{ + int len = sizeof(*data); + len += size_round(data->ioc_inllen1); + len += size_round(data->ioc_inllen2); + return len; +} + +static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if (portal_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("PORTALS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + +#ifndef __KERNEL__ +static inline int portal_ioctl_pack(struct 
portal_ioctl_data *data, char **pbuf, + int max) +{ + char *ptr; + struct portal_ioctl_data *overlay; + data->ioc_len = portal_ioctl_packlen(data); + data->ioc_version = PORTAL_IOCTL_VERSION; + + if (*pbuf && portal_ioctl_packlen(data) > max) + return 1; + if (*pbuf == NULL) { + *pbuf = malloc(data->ioc_len); + } + if (!*pbuf) + return 1; + overlay = (struct portal_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) + LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + if (data->ioc_inlbuf2) + LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + if (portal_ioctl_is_invalid(overlay)) + return 1; + + return 0; +} +#else +#include + +/* buffer MUST be at least the size of portal_ioctl_hdr */ +static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct portal_ioctl_hdr *hdr; + struct portal_ioctl_data *data; + int err; + ENTRY; + + hdr = (struct portal_ioctl_hdr *)buf; + data = (struct portal_ioctl_data *)buf; + + err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); + if ( err ) { + EXIT; + return err; + } + + if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { + CERROR ("PORTALS: version mismatch kernel vs application\n"); + return -EINVAL; + } + + if (hdr->ioc_len + buf >= end) { + CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); + return -EINVAL; + } + + + if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { + CERROR ("PORTALS: user buffer too small for ioctl\n"); + return -EINVAL; + } + + err = copy_from_user(buf, (void *)arg, hdr->ioc_len); + if ( err ) { + EXIT; + return err; + } + + if (portal_ioctl_is_invalid(data)) { + CERROR ("PORTALS: ioctl not correctly formatted\n"); + return -EINVAL; + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + size_round(data->ioc_inllen1); + } + + EXIT; + return 0; +} +#endif + +/* ioctls for manipulating snapshots 30- */ +#define 
IOC_PORTAL_TYPE 'e' +#define IOC_PORTAL_MIN_NR 30 + +#define IOC_PORTAL_PING _IOWR('e', 30, long) +#define IOC_PORTAL_GET_DEBUG _IOWR('e', 31, long) +#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long) +#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long) +#define IOC_PORTAL_PANIC _IOWR('e', 34, long) +#define IOC_PORTAL_ADD_ROUTE _IOWR('e', 35, long) +#define IOC_PORTAL_DEL_ROUTE _IOWR('e', 36, long) +#define IOC_PORTAL_GET_ROUTE _IOWR('e', 37, long) +#define IOC_PORTAL_NAL_CMD _IOWR('e', 38, long) +#define IOC_PORTAL_GET_NID _IOWR('e', 39, long) +#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) +#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) + +#define IOC_PORTAL_MAX_NR 41 + +enum { + QSWNAL = 1, + SOCKNAL, + GMNAL, + TOENAL, + TCPNAL, + SCIMACNAL, + NAL_ENUM_END_MARKER +}; + +#ifdef __KERNEL__ +extern ptl_handle_ni_t kqswnal_ni; +extern ptl_handle_ni_t ksocknal_ni; +extern ptl_handle_ni_t ktoenal_ni; +extern ptl_handle_ni_t kgmnal_ni; +extern ptl_handle_ni_t kscimacnal_ni; +#endif + +#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) + +#define NAL_CMD_REGISTER_PEER_FD 100 +#define NAL_CMD_CLOSE_CONNECTION 101 +#define NAL_CMD_REGISTER_MYNID 102 +#define NAL_CMD_PUSH_CONNECTION 103 + +enum { + DEBUG_DAEMON_START = 1, + DEBUG_DAEMON_STOP = 2, + DEBUG_DAEMON_PAUSE = 3, + DEBUG_DAEMON_CONTINUE = 4, +}; + +/* XXX remove to lustre ASAP */ +struct lustre_peer { + ptl_nid_t peer_nid; + ptl_handle_ni_t peer_ni; +}; + +/* module.c */ +typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private); +int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private); +int kportal_nal_unregister(int nal); + +ptl_handle_ni_t *kportal_get_ni (int nal); +void kportal_put_ni (int nal); + +#ifdef __CYGWIN__ +#ifndef BITS_PER_LONG +#if (~0UL) == 0xffffffffUL +#define BITS_PER_LONG 32 +#else +#define BITS_PER_LONG 64 +#endif +#endif +#endif + +#if (BITS_PER_LONG == 32 || __WORDSIZE == 32) +# define LPU64 "%Lu" +# define LPD64 "%Ld" +# define LPX64 "%#Lx" +# 
define LPSZ "%u" +# define LPSSZ "%d" +#endif +#if (BITS_PER_LONG == 64 || __WORDSIZE == 64) +# define LPU64 "%lu" +# define LPD64 "%ld" +# define LPX64 "%#lx" +# define LPSZ "%lu" +# define LPSSZ "%ld" +#endif +#ifndef LPU64 +# error "No word size defined" +#endif + +#endif diff --git a/lnet/include/linux/portals_lib.h b/lnet/include/linux/portals_lib.h new file mode 100644 index 0000000..a528a80 --- /dev/null +++ b/lnet/include/linux/portals_lib.h @@ -0,0 +1,188 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef _PORTALS_LIB_H +#define _PORTALS_LIB_H + +#ifndef __KERNEL__ +# include +#else +# include +#endif + +#undef MIN +#define MIN(a,b) (((a)<(b)) ? (a): (b)) +#undef MAX +#define MAX(a,b) (((a)>(b)) ? (a): (b)) +#define MKSTR(ptr) ((ptr))? 
/* Round 'val' up to the next multiple of 8 (multiples of 8 are unchanged). */
static inline int size_round (int val)
{
        const int mask = 0x7;

        return (val + mask) & ~mask;
}

/*
 * Like size_round(), but for a non-zero 'val' one extra byte is added
 * before rounding; zero stays zero.
 */
static inline int size_round0(int val)
{
        return val ? ((val + 1 + 7) & (~0x7)) : 0;
}

/* Length of 'fset' including its terminating NUL, rounded by size_round(). */
static inline size_t round_strlen(char *fset)
{
        size_t with_nul = strlen(fset) + 1;

        return size_round(with_nul);
}
/*
 * Copy sizeof(type) bytes from 'ptr' into 'var' and advance 'ptr' past them.
 *
 * 'ptr' and 'end' are byte pointers; 'ptr' must be a modifiable lvalue.
 * If fewer than sizeof(type) bytes remain before 'end', the ENCLOSING
 * FUNCTION returns -EFAULT.  The bounds check is made before the read so
 * that nothing beyond 'end' is ever dereferenced.
 */
#define UNLOGV(var,type,ptr,end)                                        \
do {                                                                    \
        if ((const char *)(end) - (const char *)(ptr) <                 \
            (long)sizeof(type))                                         \
                return -EFAULT;                                         \
        (var) = *(type *)(ptr);                                         \
        (ptr) += sizeof(type);                                          \
} while (0)
*)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += size_round(len + 1); \ +} while (0) + +#endif /* _PORTALS_LIB_H */ diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am new file mode 100644 index 0000000..c61b084 --- /dev/null +++ b/lnet/include/lnet/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = base +include $(top_srcdir)/Rules + +pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h + diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h new file mode 100644 index 0000000..af4a2dc --- /dev/null +++ b/lnet/include/lnet/api-support.h @@ -0,0 +1,27 @@ +# define DEBUG_SUBSYSTEM S_PORTALS +# define PORTAL_DEBUG + +#ifndef __KERNEL__ +# include +# include +# include +# include + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include +# include +# include +#endif + +#include +#include +#include + +#include +#include +#include + +/* Hack for 2.4.18 macro name collision */ +#ifdef yield +#undef yield +#endif diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h new file mode 100644 index 0000000..a83749b --- /dev/null +++ b/lnet/include/lnet/api.h @@ -0,0 +1,159 @@ +#ifndef P30_API_H +#define P30_API_H + +#include + +#ifndef PTL_NO_WRAP +int PtlInit(void); +int PtlInitialized(void); +void PtlFini(void); + +int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in, + ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid, + ptl_handle_ni_t * interface_out); + +int PtlNIInitialized(ptl_interface_t); + +int PtlNIFini(ptl_handle_ni_t interface_in); + +#endif + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); + + +/* + * Network interfaces 
+ */ + +#ifndef PTL_NO_WRAP +int PtlNIBarrier(ptl_handle_ni_t interface_in); +#endif + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out); + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out); + +#ifndef PTL_NO_WRAP +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); +#endif + + +/* + * PtlNIDebug: + * + * This is not an official Portals 3 API call. It is provided + * by the reference implementation to allow the maintainers an + * easy way to turn on and off debugging information in the + * library. Do not use it in code that is not intended for use + * with any version other than the portable reference library. + */ +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); + +/* + * PtlNIFailNid + * + * Not an official Portals 3 API call. It provides a way of simulating + * communications failures to all (nid == PTL_NID_ANY), or specific peers + * (via multiple calls), either until further notice (threshold == -1), or + * for a specific number of messages. Passing a threshold of zero, "heals" + * the given peer. 
+ */ +int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); + + +/* + * Match entries + */ + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); + +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out); + +int PtlMEUnlink(ptl_handle_me_t current_in); + +int PtlMEUnlinkList(ptl_handle_me_t current_in); + +int PtlTblDump(ptl_handle_ni_t ni, int index_in); +int PtlMEDump(ptl_handle_me_t current_in); + + + +/* + * Memory descriptors + */ + +#ifndef PTL_NO_WRAP +int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out); + +int PtlMDUnlink(ptl_handle_md_t md_in); + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in); + +#endif + +/* These should not be called by users */ +int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in, + ptl_seq_t sequence_in); + + + + +/* + * Event queues + */ +#ifndef PTL_NO_WRAP + +/* These should be called by users */ +int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out); +int PtlEQFree(ptl_handle_eq_t eventq_in); + +int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); + +int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout); 
+#endif + +/* + * Access Control Table + */ +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); + + +/* + * Data movement + */ + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in); + + + +#endif diff --git a/lnet/include/lnet/arg-blocks.h b/lnet/include/lnet/arg-blocks.h new file mode 100644 index 0000000..3c3b154 --- /dev/null +++ b/lnet/include/lnet/arg-blocks.h @@ -0,0 +1,265 @@ +#ifndef PTL_BLOCKS_H +#define PTL_BLOCKS_H + +/* + * blocks.h + * + * Argument block types for the Portals 3.0 library + * Generated by idl + * + */ + +#include + +/* put LIB_MAX_DISPATCH last here -- these must match the + assignements to the dispatch table in lib-p30/dispatch.c */ +#define PTL_GETID 1 +#define PTL_NISTATUS 2 +#define PTL_NIDIST 3 +#define PTL_NIDEBUG 4 +#define PTL_MEATTACH 5 +#define PTL_MEINSERT 6 +// #define PTL_MEPREPEND 7 +#define PTL_MEUNLINK 8 +#define PTL_TBLDUMP 9 +#define PTL_MEDUMP 10 +#define PTL_MDATTACH 11 +// #define PTL_MDINSERT 12 +#define PTL_MDBIND 13 +#define PTL_MDUPDATE 14 +#define PTL_MDUNLINK 15 +#define PTL_EQALLOC 16 +#define PTL_EQFREE 17 +#define PTL_ACENTRY 18 +#define PTL_PUT 19 +#define PTL_GET 20 +#define PTL_FAILNID 21 +#define LIB_MAX_DISPATCH 21 + +typedef struct PtlFailNid_in { + ptl_handle_ni_t interface; + ptl_nid_t nid; + unsigned int threshold; +} PtlFailNid_in; + +typedef struct PtlFailNid_out { + int rc; +} PtlFailNid_out; + +typedef struct PtlGetId_in { + ptl_handle_ni_t handle_in; +} PtlGetId_in; + +typedef struct PtlGetId_out { + int rc; + ptl_process_id_t id_out; +} PtlGetId_out; + +typedef struct 
PtlNIStatus_in { + ptl_handle_ni_t interface_in; + ptl_sr_index_t register_in; +} PtlNIStatus_in; + +typedef struct PtlNIStatus_out { + int rc; + ptl_sr_value_t status_out; +} PtlNIStatus_out; + + +typedef struct PtlNIDist_in { + ptl_handle_ni_t interface_in; + ptl_process_id_t process_in; +} PtlNIDist_in; + +typedef struct PtlNIDist_out { + int rc; + unsigned long distance_out; +} PtlNIDist_out; + + +typedef struct PtlNIDebug_in { + unsigned int mask_in; +} PtlNIDebug_in; + +typedef struct PtlNIDebug_out { + unsigned int rc; +} PtlNIDebug_out; + + +typedef struct PtlMEAttach_in { + ptl_handle_ni_t interface_in; + ptl_pt_index_t index_in; + ptl_ins_pos_t position_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; +} PtlMEAttach_in; + +typedef struct PtlMEAttach_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEAttach_out; + + +typedef struct PtlMEInsert_in { + ptl_handle_me_t current_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; + ptl_ins_pos_t position_in; +} PtlMEInsert_in; + +typedef struct PtlMEInsert_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEInsert_out; + +typedef struct PtlMEUnlink_in { + ptl_handle_me_t current_in; + ptl_unlink_t unlink_in; +} PtlMEUnlink_in; + +typedef struct PtlMEUnlink_out { + int rc; +} PtlMEUnlink_out; + + +typedef struct PtlTblDump_in { + int index_in; +} PtlTblDump_in; + +typedef struct PtlTblDump_out { + int rc; +} PtlTblDump_out; + + +typedef struct PtlMEDump_in { + ptl_handle_me_t current_in; +} PtlMEDump_in; + +typedef struct PtlMEDump_out { + int rc; +} PtlMEDump_out; + + +typedef struct PtlMDAttach_in { + ptl_handle_me_t me_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; + ptl_unlink_t unlink_in; +} PtlMDAttach_in; + +typedef struct PtlMDAttach_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDAttach_out; + + +typedef struct PtlMDBind_in { + 
ptl_handle_ni_t ni_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; +} PtlMDBind_in; + +typedef struct PtlMDBind_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDBind_out; + + +typedef struct PtlMDUpdate_internal_in { + ptl_handle_md_t md_in; + ptl_handle_eq_t testq_in; + ptl_seq_t sequence_in; + + ptl_md_t old_inout; + int old_inout_valid; + ptl_md_t new_inout; + int new_inout_valid; +} PtlMDUpdate_internal_in; + +typedef struct PtlMDUpdate_internal_out { + int rc; + ptl_md_t old_inout; + ptl_md_t new_inout; +} PtlMDUpdate_internal_out; + + +typedef struct PtlMDUnlink_in { + ptl_handle_md_t md_in; +} PtlMDUnlink_in; + +typedef struct PtlMDUnlink_out { + int rc; + ptl_md_t status_out; +} PtlMDUnlink_out; + + +typedef struct PtlEQAlloc_in { + ptl_handle_ni_t ni_in; + ptl_size_t count_in; + void *base_in; + int len_in; + int (*callback_in) (ptl_event_t * event); +} PtlEQAlloc_in; + +typedef struct PtlEQAlloc_out { + int rc; + ptl_handle_eq_t handle_out; +} PtlEQAlloc_out; + + +typedef struct PtlEQFree_in { + ptl_handle_eq_t eventq_in; +} PtlEQFree_in; + +typedef struct PtlEQFree_out { + int rc; +} PtlEQFree_out; + + +typedef struct PtlACEntry_in { + ptl_handle_ni_t ni_in; + ptl_ac_index_t index_in; + ptl_process_id_t match_id_in; + ptl_pt_index_t portal_in; +} PtlACEntry_in; + +typedef struct PtlACEntry_out { + int rc; +} PtlACEntry_out; + + +typedef struct PtlPut_in { + ptl_handle_md_t md_in; + ptl_ack_req_t ack_req_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; + ptl_hdr_data_t hdr_data_in; +} PtlPut_in; + +typedef struct PtlPut_out { + int rc; +} PtlPut_out; + + +typedef struct PtlGet_in { + ptl_handle_md_t md_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; +} PtlGet_in; + +typedef struct PtlGet_out { + int rc; +} PtlGet_out; + + +#endif diff --git 
/*
** Fallback MAX()/MIN() for builds that do not already provide them.
**
** The whole expansion is parenthesized so the macros can be used inside
** larger expressions (e.g. 2 * MAX(a, b)).  Each argument is still
** evaluated twice, so do not pass expressions with side effects.
*/
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif /* MAX */

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif /* MIN */
/*
 * Error/return codes shared between the API and library sides.
 *
 * The numeric values are spelled out explicitly; the accompanying string
 * table (ptl_err_str[], kept in api-errno.c) must be updated whenever an
 * entry is added, removed or renumbered.
 */
/* If you change these, you must update the string table in api-errno.c */
typedef enum {
        PTL_OK              = 0,
        PTL_SEGV            = 1,

        PTL_NOSPACE         = 2,
        PTL_INUSE           = 3,
        PTL_VAL_FAILED      = 4,

        PTL_NAL_FAILED      = 5,
        PTL_NOINIT          = 6,
        PTL_INIT_DUP        = 7,
        PTL_INIT_INV        = 8,
        PTL_AC_INV_INDEX    = 9,

        PTL_INV_ASIZE       = 10,
        PTL_INV_HANDLE      = 11,
        PTL_INV_MD          = 12,
        PTL_INV_ME          = 13,
        PTL_INV_NI          = 14,
/* If you change these, you must update the string table in api-errno.c */
        PTL_ILL_MD          = 15,
        PTL_INV_PROC        = 16,
        PTL_INV_PSIZE       = 17,
        PTL_INV_PTINDEX     = 18,
        PTL_INV_REG         = 19,

        PTL_INV_SR_INDX     = 20,
        PTL_ML_TOOLONG      = 21,
        PTL_ADDR_UNKNOWN    = 22,
        PTL_INV_EQ          = 23,
        PTL_EQ_DROPPED      = 24,

        PTL_EQ_EMPTY        = 25,
        PTL_NOUPDATE        = 26,
        PTL_FAIL            = 27,
        PTL_NOT_IMPLEMENTED = 28,
        PTL_NO_ACK          = 29,

        PTL_IOV_TOO_MANY    = 30,
        PTL_IOV_TOO_SMALL   = 31,

        PTL_EQ_INUSE        = 32,
        PTL_MD_INUSE        = 33,

        /* not a distinct error: same value as the last real code above,
         * so it must be bumped whenever a new code is appended */
        PTL_MAX_ERRNO       = 33
} ptl_err_t;
/* If you change these, you must update the string table in api-errno.c */
needed + * by the user application + */ + +#include + +extern int ptl_init; /* Has the library be initialized */ + +extern int ptl_ni_init(void); +extern int ptl_me_init(void); +extern int ptl_md_init(void); +extern int ptl_eq_init(void); + +extern int ptl_me_ni_init(nal_t * nal); +extern int ptl_md_ni_init(nal_t * nal); +extern int ptl_eq_ni_init(nal_t * nal); + +extern void ptl_ni_fini(void); +extern void ptl_me_fini(void); +extern void ptl_md_fini(void); +extern void ptl_eq_fini(void); + +extern void ptl_me_ni_fini(nal_t * nal); +extern void ptl_md_ni_fini(nal_t * nal); +extern void ptl_eq_ni_fini(nal_t * nal); + +static inline ptl_eq_t * +ptl_handle2usereq (ptl_handle_eq_t *handle) +{ + /* EQ handles are a little wierd. On the "user" side, the cookie + * is just a pointer to a queue of events in shared memory. It's + * cb_eq_handle is the "real" handle which we pass when we + * call do_forward(). */ + return (ptl_eq_t *)((unsigned long)handle->cookie); +} + +#endif diff --git a/lnet/include/lnet/lib-dispatch.h b/lnet/include/lnet/lib-dispatch.h new file mode 100644 index 0000000..7e5d73d --- /dev/null +++ b/lnet/include/lnet/lib-dispatch.h @@ -0,0 +1,46 @@ +#ifndef PTL_DISPATCH_H +#define PTL_DISPATCH_H + +/* + * include/dispatch.h + * + * Dispatch table header and externs for remote side + * operations + * + * Generated by idl + * + */ + +#include +#include + +extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMEUnlink(nal_cb_t * 
nal, void *private, void *args, void *ret); +extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlACEntry(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); + +extern char *dispatch_name(int index); +#endif diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000..ec3393b --- /dev/null +++ b/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,383 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + 
/*
 * Release the NAL state lock: logs the event, then calls the NAL's
 * cb_sti() method with the flags pointer that was handed to state_lock().
 *
 * Wrapped in do { } while (0), like state_lock(), so that the macro is a
 * single statement and "if (x) state_unlock(nal, &flags); else ..." parses
 * as intended.  Call sites must supply the trailing ';'.
 */
#define state_unlock(nal,flagsp)                        \
do {                                                    \
        CDEBUG(D_PORTALS, "releasing state lock\n");    \
        (nal)->cb_sti((nal), (flagsp));                 \
} while (0)
&flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL); + + if (md == NULL) + return (NULL); + + 
atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + kmem_cache_free(ptl_md_slab, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t 
*lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * +ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() to decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie, will contain the necessary information for + * finalizing the message. + * + * After it has finished handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Callbacks will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); +/* NOTE(review): buf2iov presumably reads from the char buffer, like lib_copy_buf2kiov's src - confirm against the definition */ +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h new file mode 100644 index 0000000..4052c0c --- /dev/null +++ b/lnet/include/lnet/lib-nal.h @@ -0,0 +1,102 @@ +#ifndef _LIB_NAL_H_ +#define _LIB_NAL_H_ + +/* + * nal.h + * + * Library side headers that define the abstraction layer's + * responsibilities and interfaces + */ + +#include + +struct nal_cb_t { + /* + * Per interface portal table, access control table + * and NAL private data field; + */ + lib_ni_t ni; + void *nal_data; + /* + * send: Sends a preformatted header and user data to a + * specified remote process. + * Can overwrite iov. 
+ */ + int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, size_t mlen); + /* + * recv: Receives an incoming message from a remote process + * Type of iov depends on options. Can overwrite iov. + */ + int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, size_t mlen, + size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, size_t mlen, + size_t rlen); + /* + * read: Reads a block of data from a specified user address + */ + int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); + + /* + * write: Writes a block of data into a specified user address + */ + int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len); + + /* + * callback: Calls an event callback + */ + int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); + + /* + * malloc: Acquire a block of memory in a system independent + * fashion. + */ + void *(*cb_malloc) (nal_cb_t * nal, size_t len); + + void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to cb_unmap() is what cb_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. 
+ */ + int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); + + /* Turn interrupts off (begin of protected area) */ + void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); + + /* Turn interrupts on (end of protected area) */ + void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); + + /* + * Calculate a network "distance" to given node + */ + int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); +}; + +#endif diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h new file mode 100644 index 0000000..ec3393b --- /dev/null +++ b/lnet/include/lnet/lib-p30.h @@ -0,0 +1,383 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + nal->cb_cli(nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +{ \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + nal->cb_sti(nal, flagsp); \ +} +#else +/* not needed in user space until we thread there */ +#define 
state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +{ \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} +#endif /* __KERNEL__ */ + +#ifndef PTL_USE_SLAB_CACHE + +#define MAX_MES 2048 +#define MAX_MDS 2048 +#define MAX_MSGS 2048 /* Outstanding messages */ +#define MAX_EQS 512 + +extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); + +static inline void * +lib_freelist_alloc (lib_freelist_t *fl) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lib_freelist_free (lib_freelist_t *fl, void *obj) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_eq_t *eq; + + state_lock (nal, &flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); + state_unlock (nal, &flags); + + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_eqs, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_md_t *md; + + state_lock (nal, &flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + 
lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL); + + if (md == NULL) + return (NULL); + + atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + kmem_cache_free(ptl_md_slab, md); +} + 
+static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * +ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS 
called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie will contain the necessary information for + * finalizing the message. + * + * After it has finished the handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Call backs will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); + +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000..08ea118 --- /dev/null +++ b/lnet/include/lnet/lib-types.h @@ -0,0 +1,273 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * p30/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef _LIB_TYPES_H_ +#define _LIB_TYPES_H_ + +#include +#ifdef __KERNEL__ +# define PTL_USE_SLAB_CACHE +# include +# include +# include +#else +# include +#endif + +/* struct nal_cb_t is defined in lib-nal.h */ +typedef struct nal_cb_t nal_cb_t; + +typedef char *user_ptr; +typedef struct lib_msg_t lib_msg_t; +typedef struct lib_ptl_t 
lib_ptl_t; +typedef struct lib_ac_t lib_ac_t; +typedef struct lib_me_t lib_me_t; +typedef struct lib_md_t lib_md_t; +typedef struct lib_eq_t lib_eq_t; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} ptl_handle_wire_t; + +/* byte-flip insensitive! */ +#define PTL_WIRE_HANDLE_NONE \ +((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) + +typedef enum { + PTL_MSG_ACK = 0, + PTL_MSG_PUT, + PTL_MSG_GET, + PTL_MSG_REPLY, + PTL_MSG_HELLO, +} ptl_msg_type_t; + +/* Each of these structs should start with an odd number of + * __u32, or the compiler could add its own padding and confuse + * everyone. + * + * Also, "length" needs to be at offset 28 of each struct. 
+ */ +typedef struct ptl_ack { + ptl_size_t mlength; + ptl_handle_wire_t dst_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for acks) moving out RSN */ +} ptl_ack_t; + +typedef struct ptl_put { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t ack_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length moving out RSN */ + ptl_size_t offset; + ptl_hdr_data_t hdr_data; +} ptl_put_t; + +typedef struct ptl_get { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t return_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for gets) moving out RSN */ + ptl_size_t src_offset; + ptl_size_t return_offset; /* unused: going RSN */ + ptl_size_t sink_length; +} ptl_get_t; + +typedef struct ptl_reply { + __u32 unused1; /* unused fields going RSN */ + ptl_handle_wire_t dst_wmd; + ptl_size_t dst_offset; /* unused: going RSN */ + __u32 unused2; + ptl_size_t length; /* common length moving out RSN */ +} ptl_reply_t; + +typedef struct { + ptl_nid_t dest_nid; + ptl_nid_t src_nid; + ptl_pid_t dest_pid; + ptl_pid_t src_pid; + __u32 type; /* ptl_msg_type_t */ + union { + ptl_ack_t ack; + ptl_put_t put; + ptl_get_t get; + ptl_reply_t reply; + } msg; +} ptl_hdr_t; + +/* All length fields in individual unions at same offset */ +/* LASSERT for same in lib-move.c */ +#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length) + +/* A HELLO message contains the portals magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * PTL_MSG_HELLO in the type field. All other fields are zero (including + * PTL_HDR_LENGTH; i.e. no payload). + * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID, so that hosts with + * multiple IP interfaces can have a single NID. These NALs should exchange + * HELLO messages when a connection is first established. 
*/ +typedef struct { + __u32 magic; /* PORTALS_PROTO_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} ptl_magicversion_t; + +#define PORTALS_PROTO_MAGIC 0xeebc0ded + +#define PORTALS_PROTO_VERSION_MAJOR 0 +#define PORTALS_PROTO_VERSION_MINOR 1 + +typedef struct { + long recv_count, recv_length, send_count, send_length, drop_count, + drop_length, msgs_alloc, msgs_max; +} lib_counters_t; + +/* temporary expedient: limit number of entries in discontiguous MDs */ +#if PTL_LARGE_MTU +# define PTL_MD_MAX_IOV 64 +#else +# define PTL_MD_MAX_IOV 16 +#endif + +struct lib_msg_t { + struct list_head msg_list; + int send_ack; + lib_md_t *md; + ptl_nid_t nid; + ptl_pid_t pid; + ptl_event_t ev; + ptl_handle_wire_t ack_wmd; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } msg_iov; +}; + +struct lib_ptl_t { + ptl_pt_index_t size; + struct list_head *tbl; +}; + +struct lib_ac_t { + int next_free; +}; + +typedef struct { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lib_handle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +struct lib_eq_t { + struct list_head eq_list; + lib_handle_t eq_lh; + ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + int eq_refcount; + int (*event_callback) (ptl_event_t * event); + void *eq_addrkey; +}; + +struct lib_me_t { + struct list_head me_list; + lib_handle_t me_lh; + ptl_process_id_t match_id; + ptl_match_bits_t match_bits, ignore_bits; + ptl_unlink_t unlink; + lib_md_t *md; +}; + +struct lib_md_t { + struct list_head md_list; + lib_handle_t md_lh; + lib_me_t *me; + user_ptr start; + ptl_size_t offset; + ptl_size_t length; + ptl_size_t max_size; + int threshold; + int pending; + ptl_unlink_t unlink; + unsigned int options; + unsigned int md_flags; + void *user_ptr; + lib_eq_t *eq; + void *md_addrkey; + unsigned int md_niov; /* # frags */ + union { 
+ struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } md_iov; +}; + +#define PTL_MD_FLAG_UNLINK (1 << 0) +#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) + +#ifndef PTL_USE_SLAB_CACHE +typedef struct +{ + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lib_freelist_t; + +typedef struct +{ + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lib_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* stash in ni.ni_test_peers */ + ptl_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lib_test_peer_t; + +typedef struct { + int up; + int refcnt; + ptl_nid_t nid; + ptl_pid_t pid; + int num_nodes; + unsigned int debug; + lib_ptl_t tbl; + lib_ac_t ac; + lib_counters_t counters; + + int ni_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ni_next_object_cookie; /* cookie generator */ + __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ + + struct list_head ni_test_peers; + +#ifndef PTL_USE_SLAB_CACHE + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; +} lib_ni_t; + +#endif diff --git a/lnet/include/lnet/list.h b/lnet/include/lnet/list.h new file mode 100644 index 0000000..41613ab --- /dev/null +++ b/lnet/include/lnet/list.h @@ -0,0 +1,246 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + + +/* + * Simple doubly linked list implementation. 
+ * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ +/* no-op prefetch: evaluates its argument and discards the result */ +#define prefetch(a) ((void)(a)) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
+ */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. 
+ */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#endif + +#ifndef list_for_each_entry +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. 
+ * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) +#endif + +#ifndef list_for_each_entry_safe +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: the &struct list_head to use as temporary storage + * (the successor link, saved before the loop body runs) + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = pos->member.next; \ + &pos->member != (head); \ + pos = list_entry(n, typeof(*pos), member), \ + n = pos->member.next) +#endif diff --git a/lnet/include/lnet/lltrace.h b/lnet/include/lnet/lltrace.h new file mode 100644 index 0000000..7d1b304 --- /dev/null +++ b/lnet/include/lnet/lltrace.h @@ -0,0 +1,175 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Compile with: + * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl + */ +#ifndef __LTRACE_H_ +#define __LTRACE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Run the "debug_kernel" ptlctl command with arguments @fname and "1" + * (via jt_dbg_debug_kernel), echoing the command line to stderr first. + * Returns whatever jt_dbg_debug_kernel() returns. + */ +static inline int ltrace_write_file(char* fname) +{ + char* argv[3]; + + argv[0] = "debug_kernel"; + argv[1] = fname; + argv[2] = "1"; + + fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]); + + return jt_dbg_debug_kernel(3, argv); +} + +/* + * Run the "clear" ptlctl command (jt_dbg_clear_debug_buf), echoing it to + * stderr first. Returns whatever jt_dbg_clear_debug_buf() returns. + */ +static inline int ltrace_clear() +{ + char* argv[1]; + + argv[0] = "clear"; + + fprintf(stderr, "[ptlctl] %s\n", argv[0]); + + return jt_dbg_clear_debug_buf(1, argv); +} + 
+/* + * Insert a "====<indent_level>=<text>" marker through jt_dbg_mark_debug_buf(). + * The marker is formatted into a PATH_MAX buffer, so longer text is truncated. + */ +static inline int ltrace_mark(int indent_level, char* text) +{ + char* argv[2]; + char mark_buf[PATH_MAX]; + + snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text); + + argv[0] = "mark"; + argv[1] = mark_buf; + return jt_dbg_mark_debug_buf(2, argv); +} + +/* Run "list applymasks" through jt_dbg_list(), echoing the command to stderr. */ +static inline int ltrace_applymasks() +{ + char* argv[2]; + argv[0] = "list"; + argv[1] = "applymasks"; + + fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]); + + return jt_dbg_list(2, argv); +} + + +/* Run "filter <subsys_or_mask>" through jt_dbg_filter() and return its result. */ +static inline int ltrace_filter(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "filter"; + argv[1] = subsys_or_mask; + return jt_dbg_filter(2, argv); +} + +/* Run "show <subsys_or_mask>" through jt_dbg_show() and return its result. */ +static inline int ltrace_show(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "show"; + argv[1] = subsys_or_mask; + return jt_dbg_show(2, argv); +} + +/* + * Initialise the debug library, register the portals ioctl device when + * PORTALS_DEV_ID is defined, then issue a fixed sequence of filter/show + * commands and apply the masks. + * + * Returns the result of register_ioc_dev() (0 when PORTALS_DEV_ID is not + * defined); the results of the filter/show/applymasks calls are ignored. + */ +static inline int ltrace_start() +{ + int rc = 0; + dbg_initialize(0, NULL); +#ifdef PORTALS_DEV_ID + rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); +#endif + ltrace_filter("class"); + ltrace_filter("socknal"); + ltrace_filter("qswnal"); + ltrace_filter("gmnal"); + ltrace_filter("portals"); + + ltrace_show("all_types"); + ltrace_filter("trace"); + ltrace_filter("malloc"); + ltrace_filter("net"); + ltrace_filter("page"); + ltrace_filter("other"); + ltrace_filter("info"); + ltrace_applymasks(); + + return rc; +} + + +/* Unregister the portals ioctl device; a no-op when PORTALS_DEV_ID is undefined. */ +static inline void ltrace_stop() +{ +#ifdef PORTALS_DEV_ID + unregister_ioc_dev(PORTALS_DEV_ID); +#endif +} + +static inline int not_uml() +{ + /* Return Values: + * 0 when run under UML (stat of /dev/ubd succeeds) + * 1 when run on host (/dev/ubd does not exist) + * 1 as well when the lookup fails for another reason: + * the error is reported on stderr and "host" is assumed + */ + struct stat buf; + int rc = stat("/dev/ubd", &buf); + rc = ((rc<0) && (errno == ENOENT)) ? 
1 : rc; + if (rc<0) { + fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno)); + rc = 1; /* Assume host */ + } + return rc; +} + +#define LTRACE_MAX_NOB 256 +static inline void ltrace_add_processnames(char* fname) +{ + char cmdbuf[LTRACE_MAX_NOB]; + struct timeval tv; + struct timezone tz; + int nob; + int underuml = !not_uml(); + + gettimeofday(&tv, &tz); + + nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \""); + + /* Careful - these format strings need to match the CDEBUG + * formats in portals/linux/debug.c EXACTLY + */ + nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, "%02x:%06x:%d:%lu.%06lu ", + S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec); + + if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) { + nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB, + "(%s:%d:%s() %d | %d+%lu): ", + "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L); + } + else { + nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB, + "(%s:%d:%s() %d+%lu): ", + "lltrace.h", __LINE__, __FUNCTION__, 0, 0L); + } + + nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, " %%p %%c\" >> %s", fname); + system(cmdbuf); +} + +#endif diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h new file mode 100644 index 0000000..a4ea39b --- /dev/null +++ b/lnet/include/lnet/lnet.h @@ -0,0 +1,72 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _P30_H_ +#define _P30_H_ + +/* + * p30.h + * + * User application interface file + */ + +#if defined (__KERNEL__) +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include +#include + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, 
& PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. + */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... + */ +#define PTL_DEBUG_NI_ALL (0xF000ul) /* Only the NAL flags */ +#define PTL_DEBUG_NI0 __bit(24) +#define PTL_DEBUG_NI1 __bit(25) +#define PTL_DEBUG_NI2 __bit(26) +#define PTL_DEBUG_NI3 __bit(27) +#define PTL_DEBUG_NI4 __bit(28) +#define PTL_DEBUG_NI5 __bit(29) +#define PTL_DEBUG_NI6 __bit(30) +#define PTL_DEBUG_NI7 __bit(31) + +#endif diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h new file mode 100644 index 0000000..fdaae69 --- /dev/null +++ b/lnet/include/lnet/lnetctl.h @@ -0,0 +1,74 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int 
jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)); +int jt_ioc_dump(int argc, char **argv); + +#endif diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h new file mode 100644 index 0000000..6a61fd5 --- /dev/null +++ b/lnet/include/lnet/myrnal.h @@ -0,0 +1,27 @@ +/* +** $Id: myrnal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $ +*/ + +#ifndef MYRNAL_H +#define MYRNAL_H + +#define MAX_ARGS_LEN (256) +#define MAX_RET_LEN (128) +#define MYRNAL_MAX_ACL_SIZE (64) +#define MYRNAL_MAX_PTL_SIZE (64) + +#define P3CMD (100) +#define P3SYSCALL (200) +#define P3REGISTER (300) + +enum { PTL_MLOCKALL }; + +typedef struct { + void *args; + size_t args_len; + void *ret; + size_t ret_len; + int p3cmd; +} myrnal_forward_t; + +#endif /* MYRNAL_H */ diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h new file mode 100644 index 0000000..c1c50ed --- /dev/null +++ b/lnet/include/lnet/nal.h @@ -0,0 +1,50 @@ +/* +** $Id: nal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $ +*/ +#ifndef _NAL_H_ +#define _NAL_H_ + +/* + * p30/nal.h + * + * The API side NAL declarations + */ + +#include + +#ifdef yield +#undef yield +#endif + +typedef struct nal_t nal_t; + +struct nal_t { + ptl_ni_t ni; + int refct; + void *nal_data; + int *timeout; /* for libp30api users */ + int (*forward) (nal_t * nal, int index, /* Function ID */ + void *args, size_t arg_len, void *ret, size_t ret_len); + + int (*shutdown) (nal_t * nal, int interface); + + int (*validate) (nal_t * nal, void *base, size_t extent); + + void (*yield) (nal_t * nal); + + void (*lock) (nal_t * nal, unsigned long *flags); + + void (*unlock) (nal_t * nal, unsigned long *flags); 
+}; + +typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); + +extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); + +#ifndef PTL_IFACE_DEFAULT +#define PTL_IFACE_DEFAULT (PTL_IFACE_IP) +#endif + +#endif diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h new file mode 100644 index 0000000..1b837b4 --- /dev/null +++ b/lnet/include/lnet/nalids.h @@ -0,0 +1,4 @@ +#define PTL_IFACE_TCP 1 +#define PTL_IFACE_ER 2 +#define PTL_IFACE_SS 3 +#define PTL_IFACE_MAX 4 diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h new file mode 100644 index 0000000..a4ea39b --- /dev/null +++ b/lnet/include/lnet/p30.h @@ -0,0 +1,72 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _P30_H_ +#define _P30_H_ + +/* + * p30.h + * + * User application interface file + */ + +#if defined (__KERNEL__) +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include +#include + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, & PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. 
+ */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... + */ +#define PTL_DEBUG_NI_ALL (0xF000ul) /* Only the NAL flags */ +#define PTL_DEBUG_NI0 __bit(24) +#define PTL_DEBUG_NI1 __bit(25) +#define PTL_DEBUG_NI2 __bit(26) +#define PTL_DEBUG_NI3 __bit(27) +#define PTL_DEBUG_NI4 __bit(28) +#define PTL_DEBUG_NI5 __bit(29) +#define PTL_DEBUG_NI6 __bit(30) +#define PTL_DEBUG_NI7 __bit(31) + +#endif diff --git a/lnet/include/lnet/ppid.h b/lnet/include/lnet/ppid.h new file mode 100644 index 0000000..34e5dc5 --- /dev/null +++ b/lnet/include/lnet/ppid.h @@ -0,0 +1,53 @@ +/* + * TITLE(ppid_h, "@(#) $Id: ppid.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $"); + */ + +#ifndef _INCppidh_ +#define _INCppidh_ + +#include "defines.h" +// #include "idtypes.h" + + +#define MAX_PPID 1000 /* this needs to fit into 16 bits so the + maximum value is 65535. having it "large" + can help w/ debugging process accounting + but there are reasons for making it + somewhat smaller than the maximum -- + requiring storage for arrays that index + on the ppid, eg... */ + +#define MAX_GID 1000 /* this needs to fit into 16 bits... */ + +#define MAX_FIXED_PPID 100 +#define MAX_FIXED_GID 100 +#define PPID_FLOATING MAX_FIXED_PPID+1 /* Floating area starts here */ +#define GID_FLOATING MAX_FIXED_GID+1 /* Floating area starts here */ +#define NUM_PTL_TASKS MAX_FIXED_PPID+80 /* Maximum no. 
portals tasks */ + +#define PPID_AUTO 0 + +/* Minimum PPID is 1 */ +#define PPID_BEBOPD 1 /* bebopd */ +#define GID_BEBOPD 1 /* bebopd */ + +#define PPID_PCT 2 /* pct */ +#define GID_PCT 2 /* pct */ + +#define PPID_FYOD 3 /* fyod */ +#define GID_FYOD 3 /* fyod */ + +#define PPID_GDBWRAP 11 /* portals proxy for gdb */ +#define GID_GDBWRAP 11 /* portals proxy for gdb */ + +#define PPID_TEST 15 /* for portals tests */ +#define GID_TEST 15 + +#define GID_YOD 5 /* yod */ +#define GID_PINGD 6 /* pingd */ +#define GID_BT 7 /* bt */ +#define GID_PTLTEST 8 /* ptltest */ +#define GID_CGDB 9 /* cgdb */ +#define GID_TVDSVR 10 /* start-tvdsvr */ + +#endif /* _INCppidh_ */ diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h new file mode 100644 index 0000000..fdaae69 --- /dev/null +++ b/lnet/include/lnet/ptlctl.h @@ -0,0 +1,74 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)); +int jt_ioc_dump(int argc, char 
**argv); + +#endif diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h new file mode 100644 index 0000000..65ab189 --- /dev/null +++ b/lnet/include/lnet/stringtab.h @@ -0,0 +1,6 @@ +/* +** $Id: stringtab.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $ +*/ +/* + * stringtab.h + */ diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h new file mode 100644 index 0000000..d4038b6 --- /dev/null +++ b/lnet/include/lnet/types.h @@ -0,0 +1,157 @@ +#ifndef _P30_TYPES_H_ +#define _P30_TYPES_H_ + +#ifdef __linux__ +#include +#include +#else +#include +typedef u_int32_t __u32; +typedef u_int64_t __u64; +typedef unsigned long long cycles_t; +/* non-Linux stub: no cycle counter is read, 0 is always returned */ +static inline cycles_t get_cycles(void) { return 0; } +#endif + +typedef __u64 ptl_nid_t; +typedef __u32 ptl_pid_t; +typedef __u32 ptl_pt_index_t; +typedef __u32 ptl_ac_index_t; +typedef __u64 ptl_match_bits_t; +typedef __u64 ptl_hdr_data_t; +typedef __u32 ptl_size_t; + +typedef struct { + unsigned long nal_idx; /* which network interface */ + __u64 cookie; /* which thing on that interface */ +} ptl_handle_any_t; + +typedef ptl_handle_any_t ptl_handle_ni_t; +typedef ptl_handle_any_t ptl_handle_eq_t; +typedef ptl_handle_any_t ptl_handle_md_t; +typedef ptl_handle_any_t ptl_handle_me_t; + +/* sentinel handle: both fields are initialised from -1 */ +#define PTL_HANDLE_NONE \ +((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) +#define PTL_EQ_NONE PTL_HANDLE_NONE + +/* Returns non-zero when both handles carry the same nal_idx and the same cookie. */ +static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +{ + return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); +} + +#define PTL_NID_ANY ((ptl_nid_t) -1) +#define PTL_PID_ANY ((ptl_pid_t) -1) + +typedef struct { + ptl_nid_t nid; + ptl_pid_t pid; /* node id / process id */ +} ptl_process_id_t; + +typedef enum { + PTL_RETAIN = 0, + PTL_UNLINK +} ptl_unlink_t; + +typedef enum { + PTL_INS_BEFORE, + PTL_INS_AFTER +} ptl_ins_pos_t; + +typedef struct { + struct page *kiov_page; + unsigned int kiov_len; + unsigned int kiov_offset; +} ptl_kiov_t; + +typedef struct { + void *start; + 
ptl_size_t length; + int threshold; + int max_size; + unsigned int options; + void *user_ptr; + ptl_handle_eq_t eventq; + unsigned int niov; +} ptl_md_t; + +/* Options for the MD structure */ +#define PTL_MD_OP_PUT (1 << 0) +#define PTL_MD_OP_GET (1 << 1) +#define PTL_MD_MANAGE_REMOTE (1 << 2) +#define PTL_MD_AUTO_UNLINK (1 << 3) +#define PTL_MD_TRUNCATE (1 << 4) +#define PTL_MD_ACK_DISABLE (1 << 5) +#define PTL_MD_IOV (1 << 6) +#define PTL_MD_MAX_SIZE (1 << 7) +#define PTL_MD_KIOV (1 << 8) + +#define PTL_MD_THRESH_INF (-1) + +typedef enum { + PTL_EVENT_GET, + PTL_EVENT_PUT, + PTL_EVENT_REPLY, + PTL_EVENT_ACK, + PTL_EVENT_SENT +} ptl_event_kind_t; + +#define PTL_SEQ_BASETYPE long +typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; +#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) + +typedef struct { + ptl_event_kind_t type; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength, mlength, offset; + ptl_handle_me_t unlinked_me; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + cycles_t arrival_time; + volatile ptl_seq_t sequence; +} ptl_event_t; + + +typedef enum { + PTL_ACK_REQ, + PTL_NOACK_REQ +} ptl_ack_req_t; + + +typedef struct { + volatile ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + ptl_handle_any_t cb_eq_handle; +} ptl_eq_t; + +typedef struct { + ptl_eq_t *eq; +} ptl_ni_t; + + +typedef struct { + int max_match_entries; /* max number of match entries */ + int max_mem_descriptors; /* max number of memory descriptors */ + int max_event_queues; /* max number of event queues */ + int max_atable_index; /* maximum access control list table index */ + int max_ptable_index; /* maximum portals table index */ +} ptl_ni_limits_t; + +/* + * Status registers + */ +typedef enum { + PTL_SR_DROP_COUNT, + PTL_SR_DROP_LENGTH, + PTL_SR_RECV_COUNT, + PTL_SR_RECV_LENGTH, + PTL_SR_SEND_COUNT, + PTL_SR_SEND_LENGTH, + PTL_SR_MSGS_MAX, +} ptl_sr_index_t; + +typedef int ptl_sr_value_t; + +#endif diff 
--git a/lnet/klnds/Makefile.am b/lnet/klnds/Makefile.am new file mode 100644 index 0000000..5c6085e --- /dev/null +++ b/lnet/klnds/Makefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@ diff --git a/lnet/klnds/Makefile.mk b/lnet/klnds/Makefile.mk new file mode 100644 index 0000000..ce40a60 --- /dev/null +++ b/lnet/klnds/Makefile.mk @@ -0,0 +1,4 @@ +include ../Kernelenv + +obj-y = socknal/ +# more coming... \ No newline at end of file diff --git a/lnet/klnds/gmlnd/Makefile.am b/lnet/klnds/gmlnd/Makefile.am new file mode 100644 index 0000000..1dc6f4e --- /dev/null +++ b/lnet/klnds/gmlnd/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kgmnal +modulenet_DATA = kgmnal.o +EXTRA_PROGRAMS = kgmnal + +DEFS = +kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h diff --git a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch new file mode 100644 index 0000000..23c80d9 --- /dev/null +++ b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch @@ -0,0 +1,43 @@ +diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c +--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002 ++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002 +@@ -30,6 +30,8 @@ + * + ************************************************************************/ + ++#define EXPORT_SYMTAB ++ + #include + #include + +@@ -4075,6 +4077,28 @@ + return 0; + } + ++EXPORT_SYMBOL(gm_blocking_receive_no_spin); ++EXPORT_SYMBOL(gm_close); ++EXPORT_SYMBOL(gm_dma_free); ++EXPORT_SYMBOL(gm_dma_malloc); ++EXPORT_SYMBOL(gm_drop_sends); ++EXPORT_SYMBOL(gm_finalize); 
++EXPORT_SYMBOL(gm_get_node_id); ++EXPORT_SYMBOL(gm_init); ++EXPORT_SYMBOL(gm_initialize_alarm); ++EXPORT_SYMBOL(gm_max_node_id_in_use); ++EXPORT_SYMBOL(gm_min_size_for_length); ++EXPORT_SYMBOL(gm_num_receive_tokens); ++EXPORT_SYMBOL(gm_num_send_tokens); ++EXPORT_SYMBOL(gm_open); ++EXPORT_SYMBOL(gm_provide_receive_buffer); ++EXPORT_SYMBOL(gm_resume_sending); ++EXPORT_SYMBOL(gm_send_with_callback); ++EXPORT_SYMBOL(gm_set_acceptable_sizes); ++EXPORT_SYMBOL(gm_set_alarm); ++EXPORT_SYMBOL(gm_unknown); ++ ++ + /* + This file uses GM standard indentation. + +Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~ +Only in gm-1.5.2.1_Linux-cfs/: trace diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h new file mode 100644 index 0000000..47e8c3c --- /dev/null +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -0,0 +1,101 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _GMNAL_H +#define _GMNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_GMNAL + +#include +#include +#include + +#include + + +/* + * Myrinet GM NAL + */ +#define NPAGES_LARGE 16 +#define NPAGES_SMALL 1 +#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE +#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE +#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE)) +#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL)) + +#define TXMSGS 64 /* Number of Transmit Messages */ +#define ENVELOPES 8 /* Number of outstanding receive msgs */ + +#define KGM_PORT_NUM 3 +#define KGM_HOSTNAME "kgmnal" + + +typedef struct { + char *krx_buffer; + unsigned long krx_len; + unsigned int krx_size; + unsigned int krx_priority; + struct list_head krx_item; +} kgmnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + char *ktx_buffer; + size_t 
ktx_len; + unsigned long ktx_size; + int ktx_ndx; + unsigned int ktx_priority; + unsigned int ktx_tgt_node; + unsigned int ktx_tgt_port_id; +} kgmnal_tx_t; + + +typedef struct { + char kgm_init; + char kgm_shuttingdown; + struct gm_port *kgm_port; + struct list_head kgm_list; + ptl_nid_t kgm_nid; + nal_cb_t *kgm_cb; + struct kgm_trans *kgm_trans; + struct tq_struct kgm_ready_tq; + spinlock_t kgm_dispatch_lock; + spinlock_t kgm_update_lock; + spinlock_t kgm_send_lock; +} kgmnal_data_t; + +int kgm_init(kgmnal_data_t *kgm_data); +int kgmnal_recv_thread(void *); +int gm_return_mynid(void); +void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +extern kgmnal_data_t kgmnal_data; +extern nal_t kgmnal_api; +extern nal_cb_t kgmnal_lib; + +#endif /* _GMNAL_H */ + diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c new file mode 100644 index 0000000..3d4c86d --- /dev/null +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -0,0 +1,517 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* TODO + * preallocate send buffers, store on list + * put receive buffers on queue, handle with receive threads + * use routing + */ + +#include "gmnal.h" + +extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int); + +/* + * Allocate one transmit descriptor with PORTAL_ALLOC. + * The caller in this file treats a NULL result as allocation failure. + */ +static kgmnal_tx_t * +get_trans(void) +{ + kgmnal_tx_t *t; + PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t))); + return t; +} + +/* Release a descriptor obtained from get_trans(). */ +static void +put_trans(kgmnal_tx_t *t) +{ + PORTAL_FREE(t, sizeof(kgmnal_tx_t)); +} + +/* + * Returns non-zero when @nid fits in an unsigned int and is below the + * node-id count reported by gm_max_node_id_in_use(). + * NOTE(review): the status returned by gm_max_node_id_in_use() is not checked. + */ +int +kgmnal_ispeer (ptl_nid_t nid) +{ + unsigned int gmnid = (unsigned int)nid; + unsigned int nnids; + + gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */ + gmnid < nnids); /* it's in this machine */ +} + +/* + * LIB functions follow + * + */ +/* Copy @len bytes from @src_addr to @dst_addr with memcpy; @private is unused. Always returns 0. */ +static int +kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +/* Copy @len bytes from @src_addr to @dst_addr with memcpy; @private is unused. Always returns 0. */ +static int +kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +/* Allocate @len bytes with PORTAL_ALLOC on behalf of the library; @nal is unused. */ +static void * +kgmnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + +/* Free a @len byte buffer with PORTAL_FREE; @nal is unused. */ +static void +kgmnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kgmnal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kgmnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kgm_dispatch_lock,*flags); +} + + +static void +kgmnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags); +} + + +static int +kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +/* FIXME rmr: add rounting code here */ +static void +kgmnal_tx_done(kgmnal_tx_t *trans, int error) +{ + lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie); + + gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer); + + trans->ktx_buffer = NULL; + trans->ktx_len = 0; + + put_trans(trans); +} +static char * gm_error_strings[GM_NUM_STATUS_CODES] = { + [GM_SUCCESS] = "GM_SUCCESS", + [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT", + [GM_SEND_REJECTED] = "GM_SEND_REJECTED", + [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED", + [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE", + [GM_SEND_DROPPED] = "GM_SEND_DROPPED", + [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED", +}; + +inline char * get_error(int status) +{ + if (gm_error_strings[status] != NULL) + return gm_error_strings[status]; + else + return "Unknown error"; +} + +static void +kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status) +{ + CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status); +} + +static void +kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status) +{ + kgmnal_tx_t *ktx = (kgmnal_tx_t *)context; + int err = 0; + + LASSERT (p != NULL); + LASSERT (ktx != 
NULL); + + CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id); + + switch((int)status) { + case GM_SUCCESS: /* normal */ + break; + case GM_SEND_TIMED_OUT: /* application error */ + case GM_SEND_REJECTED: /* size of msg unacceptable */ + case GM_SEND_TARGET_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_TARGET_NODE_UNREACHABLE: + case GM_SEND_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_DROPPED: + CERROR("%s (%d):\n", get_error(status), status); + err = -EIO; + break; + default: + CERROR("Unknown status: %d\n", status); + err = -EIO; + break; + } + + kgmnal_tx_done(ktx, err); +} + +/* + */ + +static int +kgmnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t len) +{ + /* + * ipnal assumes that this is the private as passed to lib_dispatch.. + * so do we :/ + */ + kgmnal_tx_t *ktx=NULL; + int rc=0; + void * buf; + int buf_len = sizeof(ptl_hdr_t) + len; + int buf_size = 0; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + PROF_START(gmnal_send); + + + CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n", + len, iov, nid, KGM_PORT_NUM); + + /* ensure there is an available tx handle */ + + /* save transaction info to trans for later finalize and cleanup */ + ktx = get_trans(); + if (ktx == NULL) { + rc = -ENOMEM; + goto send_exit; + } + + /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce + header and data. + Also, memory must be dma'able or registered with GM. 
*/ + + if (buf_len <= MSG_LEN_SMALL) { + buf_size = MSG_SIZE_SMALL; + } else if (buf_len <= MSG_LEN_LARGE) { + buf_size = MSG_SIZE_LARGE; + } else { + printk("kgmnal:request exceeds TX MTU size (%d).\n", + MSG_SIZE_LARGE); + rc = -1; + goto send_exit; + } + + buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len); + if (buf == NULL) { + rc = -ENOMEM; + goto send_exit; + } + memcpy(buf, hdr, sizeof(ptl_hdr_t)); + + if (len != 0) + lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), + options, niov, iov, len); + + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + ktx->ktx_len = buf_len; + ktx->ktx_size = buf_size; + ktx->ktx_buffer = buf; + ktx->ktx_priority = GM_LOW_PRIORITY; + ktx->ktx_tgt_node = nid; + ktx->ktx_tgt_port_id = KGM_PORT_NUM; + + CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx " + "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM, + GM_LOW_PRIORITY); + + gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size, + buf_len, GM_LOW_PRIORITY, + nid, KGM_PORT_NUM, + kgmnal_txhandler, ktx); + + PROF_FINISH(gmnal_send); + send_exit: + return rc; +} +void +kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + CERROR ("forwarding not implemented\n"); +} + + +static inline void +kgmnal_requeue_rx(kgmnal_rx_t *krx) +{ + gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer, + krx->krx_size, krx->krx_priority); +} + +/* Process a received portals packet */ + +/* Receive Interrupt Handler */ +static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size, + void * buf, unsigned int pri) +{ + ptl_hdr_t *hdr = buf; + kgmnal_rx_t krx; + + CDEBUG(D_NET,"buf %p, len %ld\n", buf, len); + + if ( len < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? 
*/ + if (kgm->kgm_shuttingdown) + return; + CERROR("kgmnal: did not receive complete portal header, " + "len= %ld", len); + gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri); + return; + } + + /* might want to use seperate threads to handle receive */ + krx.krx_buffer = buf; + krx.krx_len = len; + krx.krx_size = size; + krx.krx_priority = pri; + + if ( hdr->dest_nid == kgmnal_lib.ni.nid ) { + PROF_START(lib_parse); + lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); + PROF_FINISH(lib_parse); + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx: target is " + "a peer", hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); + } else { + /* forward to gateway */ + CERROR("forwarding not implemented yet"); + kgmnal_requeue_rx(&krx); + } + + return; +} + + +static int kgmnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t mlen, + size_t rlen) +{ + kgmnal_rx_t *krx = private; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen); + + /* What was actually received must be >= what sender claims to + * have sent. This is an LASSERT, since lib-move doesn't + * check cb return code yet. */ + LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + LASSERT (mlen <= rlen); + + PROF_START(gmnal_recv); + + if(mlen != 0) { + PROF_START(memcpy); + lib_copy_buf2iov (options, niov, iov, + krx->krx_buffer + sizeof (ptl_hdr_t), mlen); + PROF_FINISH(memcpy); + } + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + kgmnal_requeue_rx(krx); + + PROF_FINISH(gmnal_recv); + + return rlen; +} + + +static void kgmnal_shutdown(void * none) +{ + CERROR("called\n"); + return; +} + +/* + * Set terminate and use alarm to wake up the recv thread. 
+ */ +static void recv_shutdown(kgmnal_data_t *kgm) +{ + /* static, not automatic: the alarm is only processed later, when the + * receive thread sees GM_ALARM_EVENT and calls gm_unknown(), so the + * structure must outlive this call */ + static gm_alarm_t alarm; + + kgm->kgm_shuttingdown = 1; + gm_initialize_alarm(&alarm); + gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL); +} + +int kgmnal_end(kgmnal_data_t *kgm) +{ + + /* wait for sends to finish ? */ + /* remove receive buffers */ + /* shutdown receive thread */ + + recv_shutdown(kgm); + + return 0; +} + +/* Used only for the spinner */ +int kgmnal_recv_thread(void *arg) +{ + kgmnal_data_t *kgm = arg; + + LASSERT(kgm != NULL); + + kportal_daemonize("kgmnal_rx"); + + while(1) { + gm_recv_event_t *e; + int priority = GM_LOW_PRIORITY; + if (kgm->kgm_shuttingdown) + break; + + e = gm_blocking_receive_no_spin(kgm->kgm_port); + if (e == NULL) { + CERROR("gm_blocking_receive returned NULL\n"); + break; + } + + switch(gm_ntohc(e->recv.type)) { + case GM_HIGH_RECV_EVENT: + priority = GM_HIGH_PRIORITY; + /* fall through */ + case GM_RECV_EVENT: + kgmnal_rx(kgm, gm_ntohl(e->recv.length), + gm_ntohc(e->recv.size), + gm_ntohp(e->recv.buffer), priority); + break; + case GM_ALARM_EVENT: + CERROR("received alarm"); + gm_unknown(kgm->kgm_port, e); + break; + case GM_BAD_SEND_DETECTED_EVENT: /* ?? 
*/ + CERROR("received bad send!\n"); + break; + default: + gm_unknown(kgm->kgm_port, e); + } + } + + CERROR("shuttting down.\n"); + return 0; +} + +nal_cb_t kgmnal_lib = { + nal_data: &kgmnal_data, /* NAL private data */ + cb_send: kgmnal_send, + cb_recv: kgmnal_recv, + cb_read: kgmnal_read, + cb_write: kgmnal_write, + cb_malloc: kgmnal_malloc, + cb_free: kgmnal_free, + cb_printf: kgmnal_printf, + cb_cli: kgmnal_cli, + cb_sti: kgmnal_sti, + cb_dist: kgmnal_dist +}; diff --git a/lnet/klnds/gmlnd/gmnal.c b/lnet/klnds/gmlnd/gmnal.c new file mode 100644 index 0000000..ceeea2a --- /dev/null +++ b/lnet/klnds/gmlnd/gmnal.c @@ -0,0 +1,284 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "gmnal.h" + +ptl_handle_ni_t kgmnal_ni; +nal_t kgmnal_api; + +kgmnal_data_t kgmnal_data; +int gmnal_debug = 0; + +kpr_nal_interface_t kqswnal_router_interface = { + kprni_nalid: GMNAL, + kprni_arg: NULL, + kprni_fwd: kgmnal_fwd_packet, +}; + +static int kgmnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ + return PTL_OK; +} + +static void kgmnal_lock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +static void kgmnal_unlock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +static int kgmnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kgmnal_api); + return 0; +} + +static void kgmnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kgmnal_api); + + if (current->need_resched) + schedule(); + return; +} + +kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx) +{ + kgmnal_rx_t *conn; + + PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t)); + /* Check for out of mem here */ + if (conn==NULL) { + printk("kgm_add_recv: memory alloc failed\n"); + return NULL; + } + + list_add(&conn->krx_item,(struct list_head *)&data->kgm_list); + // conn->ndx=ndx; + // conn->len=conn->ptlhdr_copied=0; + // conn->loopback=0; + return conn; +} + +static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + unsigned int nnids; + + 
gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n", + kgmnal_data.kgm_nid, nnids); + lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size); + return &kgmnal_api; +} + +/* Module exit: unregister and shut down the portals NI, close the GM port, + * then free the transmit descriptors and every entry on kgm_list. */ +static void __exit +kgmnal_finalize(void) +{ + struct list_head *tmp; + struct list_head *nxt; + + PORTAL_SYMBOL_UNREGISTER (kgmnal_ni); + PtlNIFini(kgmnal_ni); + /* pairs with lib_init(&kgmnal_lib, ...) in kgmnal_init */ + lib_fini(&kgmnal_lib); + + if (kgmnal_data.kgm_port) { + gm_close(kgmnal_data.kgm_port); + } + + /* FIXME: free dma buffers */ + /* FIXME: kill receiver thread */ + + PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); + + /* _safe variant: each entry is unlinked and freed while iterating */ + list_for_each_safe(tmp, nxt, &kgmnal_data.kgm_list) { + kgmnal_rx_t *conn; + conn = list_entry(tmp, kgmnal_rx_t, krx_item); + CDEBUG(D_IOCTL, "freeing conn %p\n",conn); + list_del(&conn->krx_item); + PORTAL_FREE(conn, sizeof(*conn)); + } + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + +static int __init +kgmnal_initialize(void) +{ + int rc; + int ntok; + unsigned long sizemask; + unsigned int nid; + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kgmnal_api.forward = kgmnal_forward; + kgmnal_api.shutdown = kgmnal_shutdown; + kgmnal_api.yield = kgmnal_yield; + kgmnal_api.validate = NULL; /* our api validate is a NOOP */ + kgmnal_api.lock= kgmnal_lock; + kgmnal_api.unlock= kgmnal_unlock; + kgmnal_api.nal_data = &kgmnal_data; + + kgmnal_lib.nal_data = &kgmnal_data; + + memset(&kgmnal_data, 0, sizeof(kgmnal_data)); + + INIT_LIST_HEAD(&kgmnal_data.kgm_list); + kgmnal_data.kgm_cb = &kgmnal_lib; + + /* Allocate transmit descriptors */ + PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); + if (kgmnal_data.kgm_trans==NULL) { + printk("kgmnal: init: failed to allocate transmit " + "descriptors\n"); + return -1; + } + memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS)); + + spin_lock_init(&kgmnal_data.kgm_dispatch_lock); + 
spin_lock_init(&kgmnal_data.kgm_update_lock); + spin_lock_init(&kgmnal_data.kgm_send_lock); + + /* Do the receiver and xmtr allocation */ + + rc = gm_init(); + if (rc != GM_SUCCESS) { + CERROR("gm_init failed: %d\n", rc); + return -1; + } + + rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME, + GM_API_VERSION_1_1); + if (rc != GM_SUCCESS) { + gm_finalize(); + kgmnal_data.kgm_port = NULL; + CERROR("gm_open failed: %d\n", rc); + return -1; + } + gm_get_node_id(kgmnal_data.kgm_port, &nid); + kgmnal_data.kgm_nid = nid; + /* Allocate 2 different sizes of buffers. For new, use half + the tokens for each. */ + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n", + ntok, MSG_LEN_LARGE); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_LARGE); + if (buffer == NULL) { + CERROR("gm_init failed: %d\n", rc); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_LARGE, GM_LOW_PRIORITY); + } + + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n", + ntok, MSG_LEN_SMALL); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_SMALL); + if (buffer == NULL) { + CERROR("gm_init failed: %d\n", rc); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + } + sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL); + CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n", + kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask); + 
gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY, + sizemask); + gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0); + + /* Initialize Network Interface */ + rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + return (-ENOMEM); + } + + /* Start receiver thread */ + kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0); + + PORTAL_SYMBOL_REGISTER(kgmnal_ni); + + kgmnal_data.kgm_init = 1; + + return 0; +} + +MODULE_AUTHOR("Robert Read "); +MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1"); +MODULE_LICENSE("GPL"); + +module_init (kgmnal_initialize); +module_exit (kgmnal_finalize); + +EXPORT_SYMBOL (kgmnal_ni); diff --git a/lnet/klnds/qswlnd/Makefile.am b/lnet/klnds/qswlnd/Makefile.am new file mode 100644 index 0000000..6759b96 --- /dev/null +++ b/lnet/klnds/qswlnd/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kqswnal +modulenet_DATA = kqswnal.o +EXTRA_PROGRAMS = kqswnal + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c new file mode 100644 index 0000000..d64b7ad --- /dev/null +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -0,0 +1,578 @@ +/* + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. Marcus Miller - Based on ksocknal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "qswnal.h" + +ptl_handle_ni_t kqswnal_ni; +nal_t kqswnal_api; +kqswnal_data_t kqswnal_data; + +kpr_nal_interface_t kqswnal_router_interface = { + kprni_nalid: QSWNAL, + kprni_arg: NULL, + kprni_fwd: kqswnal_fwd_packet, +}; + + +static int +kqswnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ + return (PTL_OK); +} + +static void +kqswnal_lock (nal_t *nal, unsigned long *flags) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +static void +kqswnal_unlock(nal_t *nal, unsigned long *flags) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +static int +kqswnal_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "shutdown\n"); + + LASSERT (nal == &kqswnal_api); + return (0); +} + +static void +kqswnal_yield( nal_t *nal ) +{ + CDEBUG (D_NET, "yield\n"); + + if (current->need_resched) + schedule(); + return; +} + +static nal_t * +kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, + ptl_pid_t 
requested_pid) +{ + ptl_nid_t mynid = ep_nodeid (kqswnal_data.kqn_epdev); + int nnids = ep_numnodes (kqswnal_data.kqn_epdev); + + CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid,nnids); + + lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size); + + return (&kqswnal_api); +} + +void __exit +kqswnal_finalise (void) +{ + switch (kqswnal_data.kqn_init) + { + default: + LASSERT (0); + + case KQN_INIT_ALL: + PORTAL_SYMBOL_UNREGISTER (kqswnal_ni); + /* fall through */ + + case KQN_INIT_PTL: + PtlNIFini (kqswnal_ni); + lib_fini (&kqswnal_lib); + /* fall through */ + + case KQN_INIT_DATA: + break; + + case KQN_INIT_NOTHING: + return; + } + + /**********************************************************************/ + /* Make router stop her calling me and fail any more call-ins */ + kpr_shutdown (&kqswnal_data.kqn_router); + + /**********************************************************************/ + /* flag threads to terminate, wake them and wait for them to die */ + + kqswnal_data.kqn_shuttingdown = 1; + wake_up_all (&kqswnal_data.kqn_sched_waitq); + + while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) { + CDEBUG(D_NET, "waiting for %d threads to terminate\n", + atomic_read (&kqswnal_data.kqn_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + /**********************************************************************/ + /* close elan comms */ + + if (kqswnal_data.kqn_eprx_small != NULL) + ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small); + + if (kqswnal_data.kqn_eprx_large != NULL) + ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large); + + if (kqswnal_data.kqn_eptx != NULL) + ep_free_large_xmtr (kqswnal_data.kqn_eptx); + + /**********************************************************************/ + /* No more threads. No more portals, router or comms callbacks! + * I control the horizontals and the verticals... 
+ */ + + /**********************************************************************/ + /* Complete any blocked forwarding packets with error + */ + + while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) + { + kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); + } + + while (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); + } + + /**********************************************************************/ + /* Wait for router to complete any packets I sent her + */ + + kpr_deregister (&kqswnal_data.kqn_router); + + + /**********************************************************************/ + /* Unmap message buffers and free all descriptors and buffers + */ + + if (kqswnal_data.kqn_eprxdmahandle != NULL) + { + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle, 0, + KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE); + + elan3_dma_release(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle); + } + + if (kqswnal_data.kqn_eptxdmahandle != NULL) + { + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, 0, + KQSW_NTXMSGPAGES * (KQSW_NTXMSGS + + KQSW_NNBLK_TXMSGS)); + + elan3_dma_release(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle); + } + + if (kqswnal_data.kqn_txds != NULL) + { + int i; + + for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) + { + kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; + + if (ktx->ktx_buffer != NULL) + PORTAL_FREE(ktx->ktx_buffer, + KQSW_TX_BUFFER_SIZE); + } + + PORTAL_FREE(kqswnal_data.kqn_txds, + sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS + + KQSW_NNBLK_TXMSGS)); + } + + 
if (kqswnal_data.kqn_rxds != NULL) + { + int i; + int j; + + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + for (j = 0; j < krx->krx_npages; j++) + if (krx->krx_pages[j] != NULL) + __free_page (krx->krx_pages[j]); + } + + PORTAL_FREE(kqswnal_data.kqn_rxds, + sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGS_LARGE)); + } + + /* resets flags, pointers to NULL etc */ + memset(&kqswnal_data, 0, sizeof (kqswnal_data)); + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); + + printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +static int __init +kqswnal_initialise (void) +{ + ELAN3_DMA_REQUEST dmareq; + int rc; + int i; + int elan_page_idx; + int pkmem = atomic_read(&portal_kmemory); + + LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); + + kqswnal_api.forward = kqswnal_forward; + kqswnal_api.shutdown = kqswnal_shutdown; + kqswnal_api.yield = kqswnal_yield; + kqswnal_api.validate = NULL; /* our api validate is a NOOP */ + kqswnal_api.lock = kqswnal_lock; + kqswnal_api.unlock = kqswnal_unlock; + kqswnal_api.nal_data = &kqswnal_data; + + kqswnal_lib.nal_data = &kqswnal_data; + + /* ensure all pointers NULL etc */ + memset (&kqswnal_data, 0, sizeof (kqswnal_data)); + + kqswnal_data.kqn_cb = &kqswnal_lib; + + INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); + spin_lock_init (&kqswnal_data.kqn_idletxd_lock); + init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); + INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); + + INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds); + INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); + + spin_lock_init (&kqswnal_data.kqn_sched_lock); + init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); + + spin_lock_init 
(&kqswnal_data.kqn_statelock); + + /* pointers/lists/locks initialised */ + kqswnal_data.kqn_init = KQN_INIT_DATA; + + /**********************************************************************/ + /* Find the first Elan device */ + + kqswnal_data.kqn_epdev = ep_device (0); + if (kqswnal_data.kqn_epdev == NULL) + { + CERROR ("Can't get elan device 0\n"); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Get the transmitter */ + + kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev); + if (kqswnal_data.kqn_eptx == NULL) + { + CERROR ("Can't allocate transmitter\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Get the receivers */ + + kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev, + EP_SVC_LARGE_PORTALS_SMALL, + KQSW_EP_ENVELOPES_SMALL); + if (kqswnal_data.kqn_eprx_small == NULL) + { + CERROR ("Can't install small msg receiver\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev, + EP_SVC_LARGE_PORTALS_LARGE, + KQSW_EP_ENVELOPES_LARGE); + if (kqswnal_data.kqn_eprx_large == NULL) + { + CERROR ("Can't install large msg receiver\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Reserve Elan address space for transmit buffers */ + + dmareq.Waitfn = DDI_DMA_SLEEP; + dmareq.ElanAddr = (E3_Addr) 0; + dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; + dmareq.Perm = ELAN_PERM_REMOTEREAD; + + rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState, + KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), + &dmareq, &kqswnal_data.kqn_eptxdmahandle); + if (rc != DDI_SUCCESS) + { + /* this is the transmit-side reservation (kqn_eptxdmahandle) */ + CERROR ("Can't reserve tx dma space\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* 
Reserve Elan address space for receive buffers */ + + dmareq.Waitfn = DDI_DMA_SLEEP; + dmareq.ElanAddr = (E3_Addr) 0; + dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; + dmareq.Perm = ELAN_PERM_REMOTEWRITE; + + rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState, + KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, + &dmareq, &kqswnal_data.kqn_eprxdmahandle); + if (rc != DDI_SUCCESS) + { + CERROR ("Can't reserve rx dma space\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Allocate/Initialise transmit descriptors */ + + PORTAL_ALLOC(kqswnal_data.kqn_txds, + sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + if (kqswnal_data.kqn_txds == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + /* clear flags, null pointers etc */ + memset(kqswnal_data.kqn_txds, 0, + sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) + { + int premapped_pages; + kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; + int basepage = i * KQSW_NTXMSGPAGES; + + PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); + if (ktx->ktx_buffer == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + /* Map pre-allocated buffer NOW, to save latency on transmit */ + premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer, + KQSW_TX_BUFFER_SIZE); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, + basepage, &ktx->ktx_ebuffer); + + ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ + ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ + + if (i < KQSW_NTXMSGS) + ktx->ktx_idle = &kqswnal_data.kqn_idletxds; + else + ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds; + + list_add_tail (&ktx->ktx_list, ktx->ktx_idle); + } + + 
/**********************************************************************/ + /* Allocate/Initialise receive descriptors */ + + PORTAL_ALLOC (kqswnal_data.kqn_rxds, + sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); + if (kqswnal_data.kqn_rxds == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */ + sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE)); + + elan_page_idx = 0; + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + E3_Addr elanaddr; + int j; + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + if (i < KQSW_NRXMSGS_SMALL) + { + krx->krx_npages = KQSW_NRXMSGPAGES_SMALL; + krx->krx_eprx = kqswnal_data.kqn_eprx_small; + } + else + { + krx->krx_npages = KQSW_NRXMSGPAGES_LARGE; + krx->krx_eprx = kqswnal_data.kqn_eprx_large; + } + + LASSERT (krx->krx_npages > 0); + for (j = 0; j < krx->krx_npages; j++) + { + krx->krx_pages[j] = alloc_page (GFP_KERNEL); + if (krx->krx_pages[j] == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + LASSERT(page_address(krx->krx_pages[j]) != NULL); + + elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle, + page_address(krx->krx_pages[j]), + PAGE_SIZE, elan_page_idx, + &elanaddr); + elan_page_idx++; + + if (j == 0) + krx->krx_elanaddr = elanaddr; + + /* NB we assume a contiguous */ + LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE); + } + } + LASSERT (elan_page_idx == + (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) + + (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE)); + + /**********************************************************************/ + /* Network interface ready to initialise */ + + rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni); + if (rc != 0) + { + CERROR ("PtlNIInit failed %d\n", rc); + kqswnal_finalise (); + return (-ENOMEM); + } + + kqswnal_data.kqn_init = KQN_INIT_PTL; + + 
/**********************************************************************/ + /* Queue receives, now that it's OK to run their completion callbacks */ + + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + /* NB this enqueue can allocate/sleep (attr == 0) */ + rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, + krx->krx_elanaddr, + krx->krx_npages * PAGE_SIZE, 0); + if (rc != 0) + { + CERROR ("failed ep_queue_receive %d\n", rc); + kqswnal_finalise (); + return (-ENOMEM); + } + } + + /**********************************************************************/ + /* Spawn scheduling threads */ + for (i = 0; i < smp_num_cpus; i++) + { + rc = kqswnal_thread_start (kqswnal_scheduler, NULL); + if (rc != 0) + { + CERROR ("failed to spawn scheduling thread: %d\n", rc); + kqswnal_finalise (); + return (rc); + } + } + + /**********************************************************************/ + /* Connect to the router */ + rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface); + /* a failed registration is not fatal here: initialisation carries on + * and the NAL simply runs without routing, so only report it */ + if (rc != 0) + { + CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc); + } + + PORTAL_SYMBOL_REGISTER(kqswnal_ni); + kqswnal_data.kqn_init = KQN_INIT_ALL; + + printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d " + "(Routing %s, initial mem %d)\n", + ep_nodeid (kqswnal_data.kqn_epdev), + ep_numnodes (kqswnal_data.kqn_epdev), + kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + + +MODULE_AUTHOR("W. 
Marcus Miller "); +MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00"); +MODULE_LICENSE("GPL"); + +module_init (kqswnal_initialise); +module_exit (kqswnal_finalise); + +EXPORT_SYMBOL (kqswnal_ni); diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h new file mode 100644 index 0000000..657b02b --- /dev/null +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -0,0 +1,249 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. 
+ * + */ + +#ifndef _QSWNAL_H +#define _QSWNAL_H +#define EXPORT_SYMTAB + +#ifdef PROPRIETARY_ELAN +# include +#else +# include +#endif + +#undef printf /* nasty QSW #define */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_QSWNAL + +#include +#include +#include + +#define KQSW_CHECKSUM 0 +#if KQSW_CHECKSUM +typedef unsigned long kqsw_csum_t; +#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) +#else +#define KQSW_CSUM_SIZE 0 +#endif +#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) + +/* + * Elan NAL + */ +#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (small payloads) */ +#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (large payloads) */ +/* NB small/large message sizes are GLOBAL constants */ + +/* + * Performance Tuning defines + * NB no mention of PAGE_SIZE for interoperability + */ +#if PTL_LARGE_MTU +# define KQSW_MAXPAYLOAD (256<<10) /* biggest message this NAL will cope with */ +#else +# define KQSW_MAXPAYLOAD (64<<10) /* biggest message this NAL will cope with */ +#endif + +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ + +#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ + +#define KQSW_NTXMSGS 8 /* # normal transmit messages */ +#define KQSW_NNBLK_TXMSGS 128 /* # reserved transmit messages if can't block */ + +#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ +#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ + +#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ +#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ + +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield
*/ + +/* + * derived constants + */ + +#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +/* The pre-allocated tx buffer (hdr + small payload) */ + +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +/* Reserve elan address space for pre-allocated and pre-mapped transmit + * buffer and a full payload too. Extra pages allow for page alignment */ + +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) + +#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) +/* biggest complete packet we can receive (or transmit) */ + + +typedef struct +{ + struct list_head krx_list; /* enqueue -> thread */ + EP_RCVR *krx_eprx; /* port to post receives to */ + EP_RXD *krx_rxd; /* receive descriptor (for repost) */ + E3_Addr krx_elanaddr; /* Elan address of buffer (contiguous in elan vm) */ + int krx_npages; /* # pages in receive buffer */ + int krx_nob; /* Number Of Bytes received into buffer */ + kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ + struct page *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */ + struct iovec krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */ +} kqswnal_rx_t; + +typedef struct +{ + struct list_head ktx_list; /* enqueue idle/delayed */ + struct list_head *ktx_idle; /* where to put when idle */ + char ktx_state; /* What I'm doing */ + uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ + int ktx_npages; /* pages reserved for mapping messages */ + int ktx_nmappedpages; /* # pages mapped for current message */ + EP_IOVEC ktx_iov[EP_MAXFRAG]; /* msg frags (elan vaddrs) */ + int ktx_niov; /* # message frags */ + int ktx_port; /* destination ep port */ 
+ ptl_nid_t ktx_nid; /* destination node */ + void *ktx_args[2]; /* completion passthru */ + E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ + char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ +} kqswnal_tx_t; + +#define KTX_IDLE 0 /* MUST BE ZERO (so zeroed ktx is idle) */ +#define KTX_SENDING 1 /* local send */ +#define KTX_FORWARDING 2 /* routing a packet */ + +typedef struct +{ + char kqn_init; /* what's been initialised */ + char kqn_shuttingdown; /* I'm trying to shut down */ + atomic_t kqn_nthreads; /* # threads still running */ + + kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + + struct list_head kqn_idletxds; /* transmit descriptors free to use */ + struct list_head kqn_nblk_idletxds; /* reserve of */ + spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ + wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ + struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ + + spinlock_t kqn_sched_lock; /* serialise packet schedulers */ + wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ + + struct list_head kqn_readyrxds; /* rxds full of data */ + struct list_head kqn_delayedfwds; /* delayed forwards */ + struct list_head kqn_delayedtxds; /* delayed transmits */ + + spinlock_t kqn_statelock; /* cb_cli/cb_sti */ + nal_cb_t *kqn_cb; /* -> kqswnal_lib */ + EP_DEV *kqn_epdev; /* elan device */ + EP_XMTR *kqn_eptx; /* elan transmitter */ + EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ + EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ + ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ + ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ + kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ +} kqswnal_data_t; + +/* kqn_init state */ +#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state 
is initialised OK */ +#define KQN_INIT_DATA 1 +#define KQN_INIT_PTL 2 +#define KQN_INIT_ALL 3 + +extern nal_cb_t kqswnal_lib; +extern nal_t kqswnal_api; +extern kqswnal_data_t kqswnal_data; + +extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); +extern void kqswnal_rxhandler(EP_RXD *rxd); +extern int kqswnal_scheduler (void *); +extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +static inline void +kqswnal_requeue_rx (kqswnal_rx_t *krx) +{ + ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE); +} + +static inline int +kqswnal_pages_spanned (void *base, int nob) +{ + unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT; + unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT; + + LASSERT (last_page >= first_page); /* can't wrap address space */ + return (last_page - first_page + 1); +} + +#if KQSW_CHECKSUM +static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) +{ + unsigned char *ptr = (unsigned char *)base; + + while (nob-- > 0) + sum += *ptr++; + + return (sum); +} +#endif + +#endif /* _QSWNAL_H */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c new file mode 100644 index 0000000..5979885 --- /dev/null +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -0,0 +1,1242 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. Marcus Miller - Based on ksocknal + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "qswnal.h" + +atomic_t kqswnal_packets_launched; +atomic_t kqswnal_packets_transmitted; +atomic_t kqswnal_packets_received; + + +/* + * LIB functions follow + * + */ +static int +kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static int +kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static void * +kqswnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return (buf); +} + +static void +kqswnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + + +static void +kqswnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kqn_statelock, *flags); +} + + +static void +kqswnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kqn_statelock, *flags); +} + + +static int +kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + *dist = (nid == nal->ni.nid) ? 0 : 1; + return (0); +} + +int +kqswnal_ispeer (ptl_nid_t nid) +{ + unsigned int elanid = (unsigned int)nid; + + /* didn't lose high bits on conversion and it's in this machine? */ + return ((ptl_nid_t)elanid == nid && + elanid < ep_numnodes (kqswnal_data.kqn_epdev)); +} + +void +kqswnal_unmap_tx (kqswnal_tx_t *ktx) +{ + if (ktx->ktx_nmappedpages == 0) + return; + + CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n", + ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages); + + LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages); + LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <= + kqswnal_data.kqn_eptxdmahandle->NumDvmaPages); + + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ktx->ktx_basepage, ktx->ktx_nmappedpages); + ktx->ktx_nmappedpages = 0; +} + +int +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + char *ptr; + + LASSERT (nmapped <= maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int 
fraglen = kiov->kiov_len; + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + /* each frag fits in a page */ + LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + + nmapped++; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + /* XXX this is really crap, but we'll have to kmap until + * EKC has a page (rather than vaddr) mapping interface */ + + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, page %d, %d total\n", + ktx, nfrags, ptr, fraglen, basepage, nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ptr, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + + kunmap (kiov->kiov_page); + + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage++; + kiov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +int +kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + + LASSERT (nmapped <= 
maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int fraglen = iov->iov_len; + long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + + nmapped += npages; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", + ktx, nfrags, iov->iov_base, fraglen, basepage, npages, + nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + iov->iov_base, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage += npages; + iov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +void +kqswnal_put_idle_tx (kqswnal_tx_t *ktx) +{ + kpr_fwd_desc_t *fwd = NULL; + struct list_head *idle = ktx->ktx_idle; + unsigned long flags; + + kqswnal_unmap_tx (ktx); /* release temporary mappings */ + ktx->ktx_state = KTX_IDLE; + + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + list_add (&ktx->ktx_list, idle); + + /* reserved for non-blocking tx */ + if (idle == &kqswnal_data.kqn_nblk_idletxds) { + 
spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + return; + } + + /* anything blocking for a tx descriptor? */ + if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ + { + CDEBUG(D_NET,"wakeup fwd\n"); + + fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + } + + if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq)) /* process? */ + { + /* local sender waiting for tx desc */ + CDEBUG(D_NET,"wakeup process\n"); + wake_up (&kqswnal_data.kqn_idletxd_waitq); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + if (fwd == NULL) + return; + + /* schedule packet for forwarding again */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +kqswnal_tx_t * +kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) +{ + unsigned long flags; + kqswnal_tx_t *ktx = NULL; + + for (;;) { + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kqswnal_data.kqn_idletxds)) { + ktx = list_entry (kqswnal_data.kqn_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* "normal" descriptor pool is empty */ + + if (fwd != NULL) { /* forwarded packet => queue for idle txd */ + CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); + list_add_tail (&fwd->kprfd_list, + &kqswnal_data.kqn_idletxd_fwdq); + break; + } + + /* doing a local transmit */ + if (!may_block) { + if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { + CERROR ("intr tx desc pool exhausted\n"); + break; + } + + ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* block for idle tx */ + + 
spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + CDEBUG (D_NET, "blocking for tx desc\n"); + wait_event (kqswnal_data.kqn_idletxd_waitq, + !list_empty (&kqswnal_data.kqn_idletxds)); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ + LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); + return (ktx); +} + +void +kqswnal_tx_done (kqswnal_tx_t *ktx, int error) +{ + switch (ktx->ktx_state) { + case KTX_FORWARDING: /* router asked me to forward this packet */ + kpr_fwd_done (&kqswnal_data.kqn_router, + (kpr_fwd_desc_t *)ktx->ktx_args[0], error); + break; + + case KTX_SENDING: /* packet sourced locally */ + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + (lib_msg_t *)ktx->ktx_args[1]); + break; + + default: + LASSERT (0); + } + + kqswnal_put_idle_tx (ktx); +} + +static void +kqswnal_txhandler(EP_TXD *txd, void *arg, int status) +{ + kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; + + LASSERT (txd != NULL); + LASSERT (ktx != NULL); + + CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status); + + if (status == EP_SUCCESS) + atomic_inc (&kqswnal_packets_transmitted); + + if (status != EP_SUCCESS) + { + CERROR ("kqswnal: Transmit failed with %d\n", status); + status = -EIO; + } + + kqswnal_tx_done (ktx, status); +} + +int +kqswnal_launch (kqswnal_tx_t *ktx) +{ + /* Don't block for transmit descriptor if we're in interrupt context */ + int attr = in_interrupt() ? 
(EP_NO_SLEEP | EP_NO_ALLOC) : 0; + int rc = ep_transmit_large(kqswnal_data.kqn_eptx, ktx->ktx_nid, + ktx->ktx_port, attr, kqswnal_txhandler, + ktx, ktx->ktx_iov, ktx->ktx_niov); + unsigned long flags; /* saved irq state for spin_lock_irqsave() */ + + if (rc == 0) + atomic_inc (&kqswnal_packets_launched); + + if (rc != ENOMEM) + return (rc); + + /* can't allocate ep txd => queue for later */ + + LASSERT (in_interrupt()); /* not called by thread (not looping) */ + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + return (0); +} + + +/* Map the header's message type to a printable name */ +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + /* same byte-order conversion kqswnal_cerror_hdr() applies to hdr->type */ + switch (NTOH__u32(hdr->type)) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + default: + return (""); + } +} + +static void +kqswnal_cerror_hdr(ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + CERROR("P3 Header at %p of type %s\n", hdr, type_str); + CERROR(" From nid/pid "LPU64"/%u", NTOH__u64(hdr->src_nid), + NTOH__u32(hdr->src_pid)); + CERROR(" To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid), + NTOH__u32(hdr->dest_pid)); + + switch (NTOH__u32(hdr->type)) { + case PTL_MSG_PUT: + CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 (hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + NTOH__u64 (hdr->msg.put.match_bits)); + CERROR(" Length %d, offset %d, hdr data "LPX64"\n", + NTOH__u32(PTL_HDR_LENGTH(hdr)), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + CERROR(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 (hdr->msg.get.ptl_index), + hdr->msg.get.return_wmd.wh_interface_cookie, +
hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CERROR(" Length %d, src offset %d\n", + NTOH__u32 (hdr->msg.get.sink_length), + NTOH__u32 (hdr->msg.get.src_offset)); + break; + + case PTL_MSG_ACK: + CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + NTOH__u32 (hdr->msg.ack.mlength)); + break; + + case PTL_MSG_REPLY: + CERROR(" dst md "LPX64"."LPX64", length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + NTOH__u32 (PTL_HDR_LENGTH(hdr))); + } + +} /* end of print_hdr() */ + +static int +kqswnal_sendmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + kqswnal_tx_t *ktx; + int rc; + ptl_nid_t gatewaynid; +#if KQSW_CHECKSUM + int i; + kqsw_csum_t csum; + int sumnob; +#endif + + /* NB, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. + */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 + " pid %u\n", payload_nob, payload_niov, nid, pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + if (payload_nob > KQSW_MAXPAYLOAD) { + CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", + payload_nob, KQSW_MAXPAYLOAD); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + if (!kqswnal_ispeer (nid)) { /* Can't send direct: find gateway? 
*/ + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + if (!kqswnal_ispeer (gatewaynid)) { + CERROR("Bad gateway "LPX64" for "LPX64"\n", + gatewaynid, nid); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + nid = gatewaynid; + } + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (ktx == NULL) { + kqswnal_cerror_hdr (hdr); + lib_finalize (&kqswnal_lib, private, cookie); + } + + memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + +#if KQSW_CHECKSUM + csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); + memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); + for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + if (payload_kiov != NULL) { + ptl_kiov_t *kiov = &payload_kiov[i]; + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset; + + csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); + sumnob -= kiov->kiov_len; + } else { + struct iovec *iov = &payload_iov[i]; + + csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); + sumnob -= iov->iov_len; + } + } + memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); +#endif + + /* Set up first frag from pre-mapped buffer (it's at least the + * portals header) */ + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; + ktx->ktx_iov[0].Len = KQSW_HDR_SIZE; + ktx->ktx_niov = 1; + + if (payload_nob > 0) { /* got some payload (something more to do) */ + /* make a single contiguous message? 
*/ + if (payload_nob <= KQSW_TX_MAXCONTIG) { + /* copy payload to ktx_buffer, immediately after hdr */ + if (payload_kiov != NULL) + lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_kiov, payload_nob); + else + lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_iov, payload_nob); + /* first frag includes payload */ + ktx->ktx_iov[0].Len += payload_nob; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov (ktx, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov (ktx, payload_nob, + payload_niov, payload_iov); + if (rc != 0) { + kqswnal_put_idle_tx (ktx); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + } + } + + ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_SENDING; /* => lib_finalize() on completion */ + ktx->ktx_args[0] = private; + ktx->ktx_args[1] = cookie; + + rc = kqswnal_launch (ktx); + if (rc != 0) { /* failed? 
*/ + CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + return (0); +} + +static int +kqswnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, payload_nob)); +} + +static int +kqswnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + +int kqswnal_fwd_copy_contig = 0; + +void +kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + int rc; + kqswnal_tx_t *ktx; + struct iovec *iov = fwd->kprfd_iov; + int niov = fwd->kprfd_niov; + int nob = fwd->kprfd_nob; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + /* The router wants this NAL to forward a packet */ + CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + fwd, nid, niov, nob); + + LASSERT (niov > 0); + + ktx = kqswnal_get_idle_tx (fwd, FALSE); + if (ktx == NULL) /* can't get txd right now */ + return; /* fwd will be scheduled when tx desc freed */ + + if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + nid = fwd->kprfd_target_nid; /* target is final dest */ + + if (!kqswnal_ispeer (nid)) { + CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); + rc = -EHOSTUNREACH; + goto failed; + } + + if (nob > KQSW_NRXMSGBYTES_LARGE) { + CERROR ("Can't forward [%p] to "LPX64 + ": size %d bigger than max packet 
size %ld\n", + fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); + rc = -EMSGSIZE; + goto failed; + } + + if ((kqswnal_fwd_copy_contig || niov > 1) && + nob <= KQSW_TX_BUFFER_SIZE) + { + /* send from ktx's pre-allocated/mapped contiguous buffer? */ + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ + ktx->ktx_iov[0].Len = nob; + ktx->ktx_niov = 1; + } + else + { + /* zero copy */ + ktx->ktx_niov = 0; /* no frags mapped yet */ + rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + if (rc != 0) + goto failed; + } + + ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_FORWARDING; /* kpr_put_packet() on completion */ + ktx->ktx_args[0] = fwd; + + rc = kqswnal_launch (ktx); + if (rc == 0) + return; + + failed: + LASSERT (rc != 0); + CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); + + kqswnal_put_idle_tx (ktx); + /* complete now (with failure) */ + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; + + /* The router has finished forwarding this packet */ + + if (error != 0) + { + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); + } + + kqswnal_requeue_rx (krx); +} + +void +kqswnal_rx (kqswnal_rx_t *krx) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int nob; + int niov; + + if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ + /* NB krx requeued when lib_parse() calls back kqswnal_recv */ + lib_parse (&kqswnal_lib, hdr, krx); + return; + } + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + if 
(kqswnal_ispeer (dest_nid)) /* should have gone direct to peer */ + { + CERROR("dropping packet from "LPX64" for "LPX64 + ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); + kqswnal_requeue_rx (krx); + return; + } + + /* NB forwarding may destroy iov; rebuild every time */ + for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) + { + LASSERT (niov < krx->krx_npages); + krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); + krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + } + + kpr_fwd_init (&krx->krx_fwd, dest_nid, + krx->krx_nob, niov, krx->krx_iov, + kqswnal_fwd_callback, krx); + + kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); +} + +/* Receive Interrupt Handler: posts to schedulers */ +void +kqswnal_rxhandler(EP_RXD *rxd) +{ + unsigned long flags; /* saved irq state for spin_lock_irqsave() */ + int nob = ep_rxd_len (rxd); + int status = ep_rxd_status (rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); + + CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", + rxd, krx, nob, status); + + LASSERT (krx != NULL); + + krx->krx_rxd = rxd; + krx->krx_nob = nob; + + /* must receive a whole header to be able to parse */ + if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) + { + /* receives complete with failure when receiver is removed */ + if (kqswnal_data.kqn_shuttingdown) + return; + + CERROR("receive status failed with status %d nob %d\n", + ep_rxd_status(rxd), nob); + kqswnal_requeue_rx (krx); + return; + } + + atomic_inc (&kqswnal_packets_received); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +#if KQSW_CHECKSUM +/* Report a header (ishdr != 0) or payload checksum mismatch for krx */ +void +kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR ("%s checksum mismatch %p: dnid "LPX64", snid
"LPX64 + ", dpid %d, spid %d, type %d\n", + ishdr ? "Header" : "Payload", krx, + NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid), + NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid), + NTOH__u32(hdr->type)); + + switch (NTOH__u32 (hdr->type)) + { + case PTL_MSG_ACK: + CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 + " len %u\n", + NTOH__u32(hdr->msg.ack.mlength), + hdr->msg.ack.dst_wmd.handle_cookie, + hdr->msg.ack.dst_wmd.handle_idx, + NTOH__u64(hdr->msg.ack.match_bits), + NTOH__u32(hdr->msg.ack.length)); + break; + case PTL_MSG_PUT: + CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 + " len %u off %u data "LPX64"\n", + NTOH__u32(hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.handle_cookie, + hdr->msg.put.ack_wmd.handle_idx, + NTOH__u64(hdr->msg.put.match_bits), + NTOH__u32(hdr->msg.put.length), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + case PTL_MSG_GET: + CERROR ("GET: <>\n"); + break; + case PTL_MSG_REPLY: + CERROR ("REPLY: <>\n"); + break; + default: + CERROR ("TYPE?: <>\n"); + } +} +#endif + +static int +kqswnal_recvmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + int page; + char *page_ptr; + int page_nob; + char *iov_ptr; + int iov_nob; + int frag; +#if KQSW_CHECKSUM + kqsw_csum_t senders_csum; + kqsw_csum_t payload_csum = 0; + kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), + sizeof(ptl_hdr_t)); + size_t csum_len = mlen; + int csum_frags = 0; + int csum_nob = 0; + static atomic_t csum_counter; + int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; + + atomic_inc (&csum_counter); + + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + if (senders_csum != hdr_csum) + kqswnal_csum_error (krx, 1); +#endif + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); +
+ /* What was actually received must be >= payload. + * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ + LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + LASSERT (mlen <= rlen); + + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + if (mlen != 0) + { + page = 0; + page_ptr = ((char *) page_address(krx->krx_pages[0])) + + KQSW_HDR_SIZE; + page_nob = PAGE_SIZE - KQSW_HDR_SIZE; + + LASSERT (niov > 0); + if (kiov != NULL) { + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + + for (;;) + { + /* We expect the iov to exactly match mlen */ + LASSERT (iov_nob <= mlen); + + frag = MIN (page_nob, iov_nob); + memcpy (iov_ptr, page_ptr, frag); +#if KQSW_CHECKSUM + payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); + csum_nob += frag; + csum_frags++; +#endif + mlen -= frag; + if (mlen == 0) + break; + + page_nob -= frag; + if (page_nob != 0) + page_ptr += frag; + else + { + page++; + LASSERT (page < krx->krx_npages); + page_ptr = page_address(krx->krx_pages[page]); + page_nob = PAGE_SIZE; + } + + iov_nob -= frag; + if (iov_nob != 0) + iov_ptr += frag; + else if (kiov != NULL) { + kunmap (kiov->kiov_page); + kiov++; + niov--; + LASSERT (niov > 0); + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov++; + niov--; + LASSERT (niov > 0); + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + } + + if (kiov != NULL) + kunmap (kiov->kiov_page); + } + +#if KQSW_CHECKSUM + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + + if (csum_len != rlen) + CERROR("Unable to checksum data in user's buffer\n"); + else if (senders_csum != payload_csum) + kqswnal_csum_error (krx, 0); + + 
if (csum_verbose) + CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " + "csum_nob %d\n", + hdr_csum, payload_csum, csum_frags, csum_nob); +#endif + lib_finalize(nal, private, cookie); + + kqswnal_requeue_rx (krx); + + return (rlen); +} + +static int +kqswnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + +static int +kqswnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + +int +kqswnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kqswnal_data.kqn_nthreads); + return (0); +} + +void +kqswnal_thread_fini (void) +{ + atomic_dec (&kqswnal_data.kqn_nthreads); +} + +int +kqswnal_scheduler (void *arg) +{ + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; + kpr_fwd_desc_t *fwd; + long flags; + int rc; + int counter = 0; + int did_something; + + kportal_daemonize ("kqswnal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + while (!kqswnal_data.kqn_shuttingdown) + { + did_something = FALSE; + + if (!list_empty (&kqswnal_data.kqn_readyrxds)) + { + krx = list_entry(kqswnal_data.kqn_readyrxds.next, + kqswnal_rx_t, krx_list); + list_del (&krx->krx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + kqswnal_rx (krx); + + did_something = TRUE; + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) + { + ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + rc = kqswnal_launch (ktx); + if 
(rc != 0) /* failed: ktx_nid down? */ + { + CERROR("Failed delayed transmit to "LPX64 + ": %d\n", ktx->ktx_nid, rc); + kqswnal_tx_done (ktx, rc); + } + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_fwd_packet (NULL, fwd); + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == KQSW_RESCHED) { + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, + kqswnal_data.kqn_shuttingdown || + !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_delayedtxds) || + !list_empty(&kqswnal_data.kqn_delayedfwds)); + LASSERT (rc == 0); + } else if (current->need_resched) + schedule (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_thread_fini (); + return (0); +} + +nal_cb_t kqswnal_lib = +{ + nal_data: &kqswnal_data, /* NAL private data */ + cb_send: kqswnal_send, + cb_send_pages: kqswnal_send_pages, + cb_recv: kqswnal_recv, + cb_recv_pages: kqswnal_recv_pages, + cb_read: kqswnal_read, + cb_write: kqswnal_write, + cb_malloc: kqswnal_malloc, + cb_free: kqswnal_free, + cb_printf: kqswnal_printf, + cb_cli: kqswnal_cli, + cb_sti: kqswnal_sti, + cb_dist: kqswnal_dist +}; diff --git a/lnet/klnds/scimaclnd/Makefile.am b/lnet/klnds/scimaclnd/Makefile.am new file mode 100644 index 0000000..6da31f0 --- /dev/null +++ b/lnet/klnds/scimaclnd/Makefile.am @@ -0,0 +1,11 @@ +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kscimacnal +modulenet_DATA = kscimacnal.o +EXTRA_PROGRAMS = kscimacnal + +DEFS = +kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h diff --git a/lnet/klnds/scimaclnd/README.scimacnal b/lnet/klnds/scimaclnd/README.scimacnal new file mode 100644 index 0000000..d4c6a49 --- /dev/null +++ b/lnet/klnds/scimaclnd/README.scimacnal @@ -0,0 +1,14 @@ + +scimacnal - A NAL for the Scali ScaMAC midlayer. + +The ScaMAC midlayer is a simplified API to the SCI high performance +interconnect. + +In order to use this NAL you'll need to tune scimac to use larger buffers. +See scimac.conf in this directory for an example. + +Overall performance and stability isn't great but this can be attributed +to the scimac driver which apparently is in need of some development. + +TODO: +Routing isn't yet implemented. diff --git a/lnet/klnds/scimaclnd/scimac.conf b/lnet/klnds/scimaclnd/scimac.conf new file mode 100644 index 0000000..bfb6d02 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimac.conf @@ -0,0 +1,35 @@ +# Configuration file for the scimac driver - lustre friendly settings +# + +# The maximal number of message headers to use in the system. +scimac_max_no_hdrs = 32 + +# The maximal number of eager buffers to use in the system. +scimac_max_no_ebufs = 8 + +# The maximal size in bytes of each eager buffer. +scimac_max_ebuf_size = 65536 + +# Enable use of a kernel thread to defer reception of packets. +# Default is to use a tasklet (sw interrupt). +scimac_use_ulevel_recv = 1 + +# The maximal number of packets queued for transfer per path at any one time. +scimac_max_send_queuelen = 2000 + +# The packet retransmit time in milliseconds. +# The time elapsed since a packet was attempted sent until the packet is resent. +scimac_pkt_rexmit_time = 200 + +# The packet's maximal retransmit time in milliseconds. +# The total time that a packet will be attempted sent before it is dropped. 
+scimac_max_rexmit_time = 5000 + +# The lowest valid node identifier in the system. +scimac_min_nodeid_number = 0x100 + +# The largest valid node identifier in the system. +scimac_max_nodeid_number = 0xff00 + +# The incremental nodeid step in the system. +scimac_nodeid_increment = 0x100 diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c new file mode 100644 index 0000000..1066d69 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimacnal.c @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * Based on gmnal, which is based on ksocknal and qswnal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ */
+
+
+#include "scimacnal.h"
+
+/* Portals network-interface handle; filled in by PtlNIInit() at module
+ * load time and exported with EXPORT_SYMBOL() */
+ptl_handle_ni_t kscimacnal_ni;
+/* API-side NAL descriptor; its methods are installed by
+ * kscimacnal_initialize() */
+nal_t kscimacnal_api;
+
+/* The single instance of this NAL's private state */
+kscimacnal_data_t kscimacnal_data;
+
+/* Router registration record.  Forwarding goes to kscimacnal_fwd_packet(),
+ * which currently only logs that it is not implemented. */
+kpr_nal_interface_t kscimacnal_router_interface = {
+        kprni_nalid: SCIMACNAL,
+        kprni_arg: NULL,
+        kprni_fwd: kscimacnal_fwd_packet,
+};
+
+
+/* API -> library entry point: hands request 'id' with its argument and
+ * return buffers to lib_dispatch().  args_len and ret_len are not used.
+ * Always returns PTL_OK. */
+static int kscimacnal_forward(nal_t *nal,
+                int id,
+                void *args, size_t args_len,
+                void *ret, size_t ret_len)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t *nal_cb = ksci->ksci_cb;
+
+        /* There is exactly one instance of everything; check we got it */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */
+        return PTL_OK;
+}
+
+
+/* API lock method: delegates to the library's cb_cli callback, which
+ * stores the saved interrupt state in *flags */
+static void kscimacnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+
+/* API unlock method: delegates to the library's cb_sti callback with the
+ * flags saved by kscimacnal_lock() */
+static void kscimacnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+
+/* API shutdown method: nothing to tear down here, always returns 0 */
+static int kscimacnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kscimacnal_api);
+        return 0;
+}
+
+
+/* API yield method: reschedule only if the kernel has asked us to */
+static void kscimacnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
+
+/* NAL init hook handed to PtlNIInit(): initialises the library side with
+ * the nid stored in kscimacnal_data and returns the API descriptor.
+ * 'interface' and 'requested_pid' are not used. */
+static nal_t *kscimacnal_init(int interface, ptl_pt_index_t ptl_size,
+                ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        int nnids = 512; /* FIXME: Need ScaMac function to get #nodes */
+
+        /* LPX64 is the format macro the other NALs use for a nid; a
+         * hard-coded %Lx does not match the nid type on every platform */
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64" nnids %d\n",
+               kscimacnal_data.ksci_nid, nnids);
+        lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size);
+        return &kscimacnal_api;
+}
+
+
+/* Called by kernel at module unload time */
+static void __exit
+kscimacnal_finalize(void)
+{
+        /* FIXME: How should the shutdown procedure really look? */
+        /* Tell kscimacnal_rx() to drop anything that still arrives */
+        kscimacnal_data.ksci_shuttingdown=1;
+
+        PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni);
+
+        PtlNIFini(kscimacnal_ni);
+        lib_fini(&kscimacnal_lib);
+
+        mac_finish(kscimacnal_data.ksci_machandle);
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+
+/* Called by kernel at module insertion time.
+ *
+ * Installs the API methods, opens ScaMAC adapter 0, checks that its MTU is
+ * large enough, reads our node id from it and brings up the Portals
+ * network interface.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENODEV  mac_init() failed
+ *   -EINVAL  the ScaMAC MTU is smaller than SCIMACNAL_MTU
+ *   -EIO     the physical address could not be read
+ *   -ENOMEM  PtlNIInit() failed
+ * On every failure after mac_init() succeeded the handle is released
+ * again with mac_finish(). */
+static int __init
+kscimacnal_initialize(void)
+{
+        int rc;
+        int err;
+        unsigned long nid=0;
+        mac_handle_t *machandle = NULL;
+
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kscimacnal_api.forward = kscimacnal_forward;
+        kscimacnal_api.shutdown = kscimacnal_shutdown;
+        kscimacnal_api.yield = kscimacnal_yield;
+        kscimacnal_api.validate = NULL; /* our api validate is a NOOP */
+        kscimacnal_api.lock= kscimacnal_lock;
+        kscimacnal_api.unlock= kscimacnal_unlock;
+        kscimacnal_api.nal_data = &kscimacnal_data;
+
+        kscimacnal_lib.nal_data = &kscimacnal_data;
+
+        /* ksci_init and ksci_shuttingdown both start out as 0 */
+        memset(&kscimacnal_data, 0, sizeof(kscimacnal_data));
+
+        kscimacnal_data.ksci_cb = &kscimacnal_lib;
+
+        /* We're not using this, but cli/sti callbacks does... ??? */
+        spin_lock_init(&kscimacnal_data.ksci_dispatch_lock);
+
+        /* FIXME: We only support one adapter for now */
+        machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx,
+                        &kscimacnal_data);
+
+        if(!machandle) {
+                CERROR("mac_init() failed\n");
+                /* a bare -1 would be reported as -EPERM */
+                return -ENODEV;
+        }
+
+        kscimacnal_data.ksci_machandle = machandle;
+
+        /* Make sure the scimac MTU is tuned */
+        if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) {
+                CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n",
+                        mac_get_mtusize(machandle), SCIMACNAL_MTU);
+                CERROR("Consult README.scimacnal for more information\n");
+                err = -EINVAL;
+                goto failed;
+        }
+
+        /* Get the node ID */
+        /* mac_get_physaddrlen() is a function instead of define, sigh */
+        LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid));
+        if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) {
+                CERROR("mac_get_physaddr() failed\n");
+                err = -EIO;
+                goto failed;
+        }
+        /* NOTE(review): ntohl() yields 32 bits while nid is an unsigned
+         * long; presumably the ScaMAC address is 4 bytes - confirm before
+         * relying on this on 64-bit or big-endian hosts */
+        nid = ntohl(nid);
+        kscimacnal_data.ksci_nid = nid;
+
+
+        /* Initialize Network Interface */
+        /* FIXME: What do the magic numbers mean? Documentation anyone? */
+        rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                err = -ENOMEM;
+                goto failed;
+        }
+
+        PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
+
+        /* We're done now, it's OK for the RX callback to do stuff */
+        kscimacnal_data.ksci_init = 1;
+
+        return 0;
+
+failed:
+        /* Single unwind point: give the adapter back and do not leave a
+         * stale handle behind in the global state */
+        mac_finish(machandle);
+        kscimacnal_data.ksci_machandle = NULL;
+        return err;
+}
+
+
+MODULE_AUTHOR("Niklas Edmundsson ");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_LICENSE("GPL");
+
+module_init (kscimacnal_initialize);
+module_exit (kscimacnal_finalize);
+
+EXPORT_SYMBOL(kscimacnal_ni);
diff --git a/lnet/klnds/scimaclnd/scimacnal.h b/lnet/klnds/scimaclnd/scimacnal.h
new file mode 100644
index 0000000..1ff180e
--- /dev/null
+++ b/lnet/klnds/scimaclnd/scimacnal.h
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson
+ */
+
+
+#ifndef _SCIMACNAL_H
+#define _SCIMACNAL_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* For PAGE_SIZE */
+
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+
+#include
+#include
+#include
+
+#include
+
+#ifndef MAC_SAPID_LUSTRE
+#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
+#endif /* MAC_SAPID_LUSTRE */
+
+#define SCIMACNAL_MTU 65536
+/* FIXME: What is really the MTU of lustre? */
+#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#endif + +typedef struct { + mac_handle_t *handle; + mac_mblk_t *msg; + mac_msg_type_t type; + void *userdata; +} kscimacnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + ptl_hdr_t ktx_hdr; +} kscimacnal_tx_t; + + +typedef struct { + char ksci_init; + char ksci_shuttingdown; + ptl_nid_t ksci_nid; + nal_cb_t *ksci_cb; + spinlock_t ksci_dispatch_lock; + mac_handle_t *ksci_machandle; +} kscimacnal_data_t; + +extern kscimacnal_data_t kscimacnal_data; +extern nal_t kscimacnal_api; +extern nal_cb_t kscimacnal_lib; + +void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata); + + +#endif /* _SCIMACNAL_H */ diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c new file mode 100644 index 0000000..7e4a2e8 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimacnal_cb.c @@ -0,0 +1,468 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ */
+
+#include "scimacnal.h"
+
+/* cb_read: copy 'len' bytes from src_addr to dst_addr with a plain
+ * memcpy().  Always returns 0. */
+static int
+kscimacnal_read (nal_cb_t *nal, void *private,
+                void *dst_addr, user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+/* cb_write: copy 'len' bytes from src_addr to dst_addr with a plain
+ * memcpy().  Always returns 0. */
+static int
+kscimacnal_write(nal_cb_t *nal, void *private,
+                user_ptr dst_addr, void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+/* cb_malloc: allocate 'len' bytes through PORTAL_ALLOC and return
+ * whatever it left in the buffer pointer */
+static void *
+kscimacnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+
+/* cb_free: release a buffer obtained from kscimacnal_malloc(); 'len' must
+ * be the size it was allocated with */
+static void
+kscimacnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+
+/* cb_printf: when D_NET debugging is enabled, format the message into a
+ * 256-byte stack buffer (longer output is truncated by vsnprintf) and
+ * print it prefixed with the current CPU number */
+static void
+kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        if (portal_debug & D_NET) {
+                va_start( ap, fmt );
+                vsnprintf( msg, sizeof(msg), fmt, ap );
+                va_end( ap );
+
+                printk("CPUId: %d %s",smp_processor_id(), msg);
+        }
+}
+
+
+/* cb_cli: take the dispatch spinlock with interrupts disabled, saving the
+ * previous interrupt state in *flags */
+static void
+kscimacnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->ksci_dispatch_lock,*flags);
+}
+
+
+/* cb_sti: release the dispatch spinlock and restore the interrupt state
+ * saved by kscimacnal_cli() */
+static void
+kscimacnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags);
+}
+
+
+/* cb_dist: report 0 for our own nid and 1 for every other nid.
+ * Always returns 0. */
+static int
+kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* FIXME: Network distance has a meaning, but is there no easy
+         * way to figure it out (depends on routing) */
+
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+
+/* Map a ScaMAC message status to its symbolic name for log messages.
+ * The result points at a string literal, hence the const return type.
+ * NOTE(review): the parameter is declared mac_status_t while the only
+ * caller passes a mac_msg_status_t - confirm against the ScaMAC headers. */
+static const char *
+get_mac_error(mac_status_t status)
+{
+        switch(status) {
+                case MAC_MSG_STAT_OK:
+                        return "MAC_MSG_STAT_OK";
+                case MAC_MSG_STAT_FREED:
+                        return "MAC_MSG_STAT_FREED";
+                case MAC_MSG_STAT_ABORTED:
+                        return "MAC_MSG_STAT_ABORTED";
+                case MAC_MSG_STAT_TIMEDOUT:
+                        return "MAC_MSG_STAT_TIMEDOUT";
+                case MAC_MSG_STAT_NODEUNREACH:
+                        return "MAC_MSG_STAT_NODEUNREACH";
+                case MAC_MSG_STAT_NETDOWN:
+                        return "MAC_MSG_STAT_NETDOWN";
+                case MAC_MSG_STAT_RESET:
+                        return "MAC_MSG_STAT_RESET";
+                case MAC_MSG_STAT_INITFAILED:
+                        return "MAC_MSG_STAT_INITFAILED";
+                case MAC_MSG_STAT_SYNCFAILED:
+                        return "MAC_MSG_STAT_SYNCFAILED";
+                case MAC_MSG_STAT_BADPROTO:
+                        return "MAC_MSG_STAT_BADPROTO";
+                case MAC_MSG_STAT_NOBUFSPACE:
+                        return "MAC_MSG_STAT_NOBUFSPACE";
+                case MAC_MSG_STAT_CONGESTION:
+                        return "MAC_MSG_STAT_CONGESTION";
+                case MAC_MSG_STAT_OTHER:
+                        return "MAC_MSG_STAT_OTHER";
+                default:
+                        return "Unknown error";
+        }
+}
+
+
+/* FIXME add routing code here ? */
+
+/* Called by ScaMac when transmission is complete (ie. message is released) */
+/* 'context' is the kscimacnal_tx_t that kscimacnal_send() attached to the
+ * header mblk; it is finalized and freed here. */
+static void
+kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
+{
+        kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
+
+        LASSERT (ktx != NULL);
+
+        /* Euh, there is no feedback when transmission fails?! */
+        /* lib_finalize() takes no completion status, so a failed send can
+         * only be logged; the error code that used to be computed here was
+         * never read and has been dropped. */
+        switch(status) {
+                case MAC_MSG_STAT_OK:        /* normal */
+                        break;
+                default:
+                        CERROR("%s (%d):\n", get_mac_error(status), status);
+                        break;
+        }
+
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+}
+
+
+/* Called by portals when it wants to send a message.
+ * Since ScaMAC has its own TX thread we don't bother setting up our own.
*/
+/* The message is built as a chain of mblks: the first one carries a
+ * private copy of the portals header, followed by one mblk per payload
+ * iov entry that reference the caller's buffers directly.
+ *
+ * Returns 0 once the message has been handed to mac_send(), -EINVAL if
+ * header plus payload exceed the ScaMAC MTU, -ENOMEM if an allocation
+ * fails, or the non-zero result of mac_send().
+ * 'type' and 'pid' are not used. */
+static int
+kscimacnal_send(nal_cb_t *nal,
+                void *private,
+                lib_msg_t *cookie,
+                ptl_hdr_t *hdr,
+                int type,
+                ptl_nid_t nid,
+                ptl_pid_t pid,
+                unsigned int payload_niov,
+                struct iovec *payload_iov,
+                size_t payload_len)
+{
+        kscimacnal_tx_t *ktx=NULL;
+        kscimacnal_data_t *ksci = nal->nal_data;
+        int rc=0;
+        int buf_len = sizeof(ptl_hdr_t) + payload_len;
+        mac_mblk_t *msg=NULL, *lastblk, *newblk;
+        unsigned long physaddr;
+
+
+        /* payload_len is a size_t and nid is 64 bits wide: print them with
+         * the LPSZ/LPX64 format macros rather than %d/%Lx */
+        CDEBUG(D_NET, "sending "LPSZ" bytes from %p to nid "LPX64
+                        " niov: %d\n",
+                        payload_len, payload_iov, nid, payload_niov);
+
+        LASSERT(ksci != NULL);
+
+        LASSERT(hdr != NULL);
+
+        /* Do real check if we can send this */
+        if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
+                CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
+                                mac_get_mtusize(ksci->ksci_machandle));
+                return -EINVAL;
+        }
+
+
+        /* save transaction info for later finalize and cleanup */
+        PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
+        if (!ktx) {
+                return -ENOMEM;
+        }
+
+        /* *SIGH* hdr is a stack variable in the calling function, so we
+         * need to copy it to a buffer. Zerocopy magic (or is it just
+         * deferred memcpy?) is annoying sometimes.  */
+        memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t));
+
+        /* First, put the header in the main message mblk */
+        msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t),
+                        kscimacnal_txrelease, ktx);
+        if (!msg) {
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return -ENOMEM;
+        }
+        mac_put_mblk(msg, sizeof(ptl_hdr_t));
+        lastblk=msg;
+
+        /* Allocate additional mblks for each iov as needed.
+         * Essentially lib_copy_iov2buf with a twist or two */
+        while (payload_len > 0)
+        {
+                ptl_size_t nob;
+
+                LASSERT (payload_niov > 0);
+
+                nob = MIN (payload_iov->iov_len, payload_len);
+
+                /* We don't need a callback on the additional mblks, since
+                 * all release callbacks seems to be called when the entire
+                 * message has been sent */
+                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
+                if(!newblk) {
+                        /* NOTE(review): if mac_free_msg() runs the release
+                         * callback of the header mblk, ktx has already been
+                         * finalized and freed by kscimacnal_txrelease() and
+                         * the PORTAL_FREE below is a double free - confirm
+                         * against the ScaMAC documentation */
+                        mac_free_msg(msg);
+                        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                        return -ENOMEM;
+                }
+                mac_put_mblk(newblk, nob);
+                mac_link_mblk(lastblk, newblk);
+                lastblk=newblk;
+
+                payload_len -= nob;
+                payload_niov--;
+                payload_iov++;
+        }
+
+        /* Everything kscimacnal_txrelease() needs to finalize the message */
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+
+        CDEBUG(D_NET, "mac_send %d bytes to nid: "LPX64"\n", buf_len, nid);
+
+        physaddr = htonl(nid);
+
+        if((rc=mac_send(ksci->ksci_machandle, msg,
+                                        (mac_physaddr_t *) &physaddr))) {
+                CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
+                /* NOTE(review): same possible double free as above */
+                mac_free_msg(msg);
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return rc;
+        }
+
+        return 0;
+}
+
+
+/* Router forwarding hook; only logs that forwarding is unsupported */
+void
+kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Process a received portals packet */
+/* Called by the ScaMac RX thread when a packet is received */
+/* 'userdata' is the kscimacnal_data_t registered with mac_init().  The
+ * message is always released with mac_free_msg() before returning. */
+void
+kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
+                void *userdata)
+{
+        ptl_hdr_t       *hdr = NULL;
+        kscimacnal_rx_t krx;
+        mac_size_t      size;
+        kscimacnal_data_t *ksci = userdata;
+
+        LASSERT(ksci != NULL);
+
+        if ( !ksci->ksci_init || ksci->ksci_shuttingdown ||
+                    type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) {
+                /* We're not interested in messages not for us, ignore */
+                mac_free_msg(msg);
+                return;
+        }
+
+        size = mac_msg_size(msg);
+
+        CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n",
+                        msg, type, size, mac_msg_mblks(msg));
+
+        if( size < sizeof( ptl_hdr_t ) ) {
+                /* Stay quiet if shutdown began after the check above, but
+                 * release the message either way; returning early without
+                 * mac_free_msg() leaked it. */
+                if (!ksci->ksci_shuttingdown)
+                        CERROR("kscimacnal: did not receive complete portal header,"
+                                        "size= %ld\n", size);
+                /* Free the message before exiting */
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* Provide everything we know */
+        krx.handle = handle;
+        krx.msg = msg;
+        krx.type = type;
+        krx.userdata = userdata;
+
+        /* mac_msg_next returns the next mblk with unread data */
+        hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) );
+
+        if(!hdr) {
+                CERROR("kscimacnal: no data block in message %p\n", msg);
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* NOTE(review): dest_nid is compared without byte-order conversion;
+         * other NALs wrap header fields in NTOH__u64() - confirm whether
+         * that is needed here */
+        if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc and calls our callback */
+                lib_parse(&kscimacnal_lib, hdr, &krx);
+                PROF_FINISH(lib_parse);
+#if 0 /* FIXME: Is it possible to detect this? */
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx:"
+                                "target is a peer\n",
+                                hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+#endif /* if 0 FIXME */
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented, mynid="LPX64
+                                " dest="LPX64"\n",
+                                kscimacnal_lib.ni.nid, hdr->dest_nid);
+        }
+
+        mac_free_msg(msg);
+
+        CDEBUG(D_NET, "msg %p: Done\n", msg);
+}
+
+
+/* Called by portals to process a received packet */
+/* Copies the first 'mlen' bytes of unread payload from the message in
+ * 'private' (a kscimacnal_rx_t) into the iov array, finalizes the portals
+ * message and returns rlen.  Payload beyond mlen is not copied. */
+static int kscimacnal_recv(nal_cb_t *nal,
+                void *private,
+                lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+                size_t mlen,
+                size_t rlen)
+{
+        kscimacnal_rx_t *krx = private;
+        mac_mblk_t      *mblk;
+        char            *src;
+        mac_size_t       pkt_len;
+        ptl_size_t       iovused=0;
+
+        LASSERT (krx != NULL);
+        LASSERT (krx->msg != NULL);
+
+        CDEBUG(D_NET,"msg %p: mlen="LPSZ", rlen="LPSZ", niov=%d\n",
+                        krx->msg, mlen, rlen, niov);
+
+        /* What was actually received must be >= what sender claims to have
+         * sent.  This is an LASSERT, since lib-move doesn't check cb return
+         * code yet. Also, rlen seems to be negative when mlen==0 so don't
+         * assert on that.
+         */
+        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
+        LASSERT (mlen==0 || mlen <= rlen);
+
+        PROF_START(memcpy);
+
+        /* mac_msg_next returns next mblk with unread data (ie. can
+         * be same mblk */
+        while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) {
+                pkt_len = mac_mblk_len(mblk);
+                src = mac_get_mblk(mblk, pkt_len); /* Next unread block */
+
+                CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld src: %p\n",
+                                krx->msg, mblk, pkt_len, src);
+
+                LASSERT(src != NULL);
+
+                /* Deliver no more than the mlen bytes portals asked for.
+                 * mlen used to stay untouched, so a message longer than
+                 * mlen was copied in full and ran off the end of the iov
+                 * array. */
+                if (pkt_len > mlen)
+                        pkt_len = mlen;
+                mlen -= pkt_len;
+
+                /* Essentially lib_copy_buf2iov but with continuation support,
+                 * we "gracefully" thrash the argument vars ;) */
+                while (pkt_len > 0) {
+                        ptl_size_t nob;
+
+                        LASSERT (niov > 0);
+
+                        LASSERT(iovused < iov->iov_len);
+
+                        nob = MIN (iov->iov_len-iovused, pkt_len);
+                        CDEBUG(D_NET, "iovbase: %p iovlen: "LPSZ" src: %p nob: %d "
+                                        "iovused: %d\n",
+                                        iov->iov_base, iov->iov_len,
+                                        src, nob, iovused);
+
+                        /* byte pointers: no arithmetic on void * */
+                        memcpy ((char *)iov->iov_base + iovused, src, nob);
+                        pkt_len -= nob;
+                        src += nob;
+
+                        if(nob+iovused < iov->iov_len) {
+                                /* We didn't use all of the iov */
+                                iovused+=nob;
+                        }
+                        else {
+                                /* This iov entry is full, move to the next */
+                                niov--;
+                                iov++;
+                                iovused=0;
+                        }
+                }
+        }
+        PROF_FINISH(memcpy);
+
+        CDEBUG(D_NET, "Calling lib_finalize.\n");
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        CDEBUG(D_NET, "Done.\n");
+
+        return rlen;
+}
+
+
+/* Library-side callback table for this NAL; the page-based send/recv
+ * variants are not provided */
+nal_cb_t kscimacnal_lib = {
+        nal_data:       &kscimacnal_data,               /* NAL private data */
+        cb_send:         kscimacnal_send,
+        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_recv:         kscimacnal_recv,
+        cb_recv_pages:   NULL,
+        cb_read:         kscimacnal_read,
+        cb_write:        kscimacnal_write,
+        cb_malloc:       kscimacnal_malloc,
+        cb_free:         kscimacnal_free,
+        cb_printf:       kscimacnal_printf,
+        cb_cli:          kscimacnal_cli,
+        cb_sti:          kscimacnal_sti,
+        cb_dist:         kscimacnal_dist
+};
diff --git a/lnet/klnds/socklnd/Makefile.am b/lnet/klnds/socklnd/Makefile.am
new file mode 100644
index 0000000..437d7fc
--- /dev/null
+++ b/lnet/klnds/socklnd/Makefile.am
@@ -0,0 +1,13 @@
+# Copyright (C) 2001 Cluster File
Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ksocknal +modulenet_DATA = ksocknal.o +EXTRA_PROGRAMS = ksocknal + +DEFS = +ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lnet/klnds/socklnd/Makefile.mk b/lnet/klnds/socklnd/Makefile.mk new file mode 100644 index 0000000..46edf01 --- /dev/null +++ b/lnet/klnds/socklnd/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Kernelenv + +obj-y += ksocknal.o +ksocknal-objs := socknal.o socknal_cb.o + diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000..d15d8c8 --- /dev/null +++ b/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,863 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socknal.h" + +ptl_handle_ni_t ksocknal_ni; +static nal_t ksocknal_api; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +ksock_nal_data_t ksocknal_data; +#else +static ksock_nal_data_t ksocknal_data; +#endif + +kpr_nal_interface_t ksocknal_router_interface = { + kprni_nalid: SOCKNAL, + kprni_arg: &ksocknal_data, + kprni_fwd: ksocknal_fwd_packet, +}; + + +int +ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ + return PTL_OK; +} + +int +ksocknal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ksocknal_close_sock(0); /* close all sockets */ +} + +void +ksocknal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ksocknal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void +ksocknal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ksocknal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ksocknal_data.ksnd_mynid); + lib_init(&ksocknal_lib, ksocknal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ksocknal_api); +} + +/* + * EXTRA functions follow + */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ksocknal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ksocknal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at 
module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. */ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->nid); + + ksocknal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +void +ksocknal_bind_irq (unsigned int irq, int cpu) +{ +#if (defined(CONFIG_SMP) && CPU_AFFINITY) + char cmdline[64]; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + snprintf (cmdline, sizeof (cmdline), + "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); + + printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); + + /* FIXME: Find a better method of setting IRQ affinity... + */ + + call_usermodehelper (argv[0], argv, envp); +#endif +} + +int +ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + ksock_sched_t *sched = NULL; + unsigned int irq = 0; + struct net_device *dev = NULL; + int ret; + int idx; + ENTRY; + + LASSERT (!in_interrupt()); + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->data_ready; + conn->ksnc_saved_write_space = sock->sk->write_space; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + 
conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + +#warning check it is OK to derefence sk->dst_cache->dev like this... + lock_sock (conn->ksnc_sock->sk); + + if (conn->ksnc_sock->sk->dst_cache != NULL) { + dev = conn->ksnc_sock->sk->dst_cache->dev; + if (dev != NULL) { + irq = dev->irq; + if (irq >= NR_IRQS) { + CERROR ("Unexpected IRQ %x\n", irq); + irq = 0; + } + } + } + + release_sock (conn->ksnc_sock->sk); + + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (irq == 0 || + ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { + /* This is a software NIC, or we haven't associated it with + * a CPU yet */ + + /* Choose the CPU with the fewest connections */ + sched = ksocknal_data.ksnd_schedulers; + for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[idx].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[idx]; + + if (irq != 0) { /* Hardware NIC */ + /* Remember which scheduler we chose */ + idx = sched - ksocknal_data.ksnd_schedulers; + + LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + + if (bind_irq) /* remember if we will bind below */ + idx |= SOCKNAL_IRQ_BOUND; + + ksocknal_data.ksnd_irq_info[irq] = idx; + } + } else { + /* This is a hardware NIC, associated with a CPU */ + idx = ksocknal_data.ksnd_irq_info[irq]; + + /* Don't bind again if we've bound already */ + if ((idx & SOCKNAL_IRQ_BOUND) != 0) + bind_irq = 0; + + sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + } + + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (bind_irq && /* irq binding required */ + irq != 0) /* hardware NIC */ + ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + + /* NOW it's safe to get called back when socket is ready... 
*/ + sock->sk->user_data = conn; + sock->sk->data_ready = ksocknal_data_ready; + sock->sk->write_space = ksocknal_write_space; + + /* ...which I call right now to get things going */ + ksocknal_data_ready (sock->sk, 0); + ksocknal_write_space (sock->sk); + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ksocknal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid == 0) { /* close ALL connections */ + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ksocknal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ksocknal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid && list_empty (&death_row)) + return (-ENOENT); + + while (!list_empty (&death_row)) { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + /* NB I _have_ to restore the callback, rather than storing + * a noop, since the socket could survive past this module + * being unloaded!! */ + conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space; + + /* OK; no more callbacks, but they could be in progress now, + * so wait for them to complete... 
*/ + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + /* ...however if I get the lock before a callback gets it, + * this will make them noop + */ + conn->ksnc_sock->sk->user_data = NULL; + + /* And drop the scheduler's connection count while I've got + * the exclusive lock */ + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, + flags); + + ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ + } + + return (0); +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + return &(sk->tp_pinfo.af_tcp); +} +#else +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + struct tcp_sock *s = (struct tcp_sock *)sk; + return &s->tcp; +} +#endif + +void +ksocknal_push_conn (ksock_conn_t *conn) +{ + struct sock *sk = conn->ksnc_sock->sk; + struct tcp_opt *tp = sock2tcp_opt(sk); + int nonagle; + int val = 1; + int rc; + mm_segment_t oldmm; + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + oldmm = get_fs (); + set_fs (KERNEL_DS); + + rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof (val)); + LASSERT (rc == 0); + + set_fs (oldmm); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); +} + +/* Passing in a zero nid pushes all connections */ +int +ksocknal_push_sock (ptl_nid_t nid) +{ + ksock_conn_t *conn; + struct list_head *tmp; + int index; + int i; + + if (nid != 0) { + conn = ksocknal_get_conn (nid); + + if (conn == NULL) + return (-ENOENT); + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + + return (0); + } + + /* NB we can't remove connections from the socket list so we have to + * cope with them being removed from under us... 
+ */ + for (index = 0; ; index++) { + read_lock (&ksocknal_data.ksnd_socklist_lock); + + i = 0; + conn = NULL; + + list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); // take a ref + break; + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + if (conn == NULL) + break; + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + } + + return (0); +} + +ksock_conn_t * +ksocknal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ksocknal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", + nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ksocknal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, sizeof (*conn)); + + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." 
*/ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready); + LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space); + LASSERT (conn->ksnc_sock->sk->user_data == NULL); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) { + ksocknal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} + +int +ksocknal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, + data->ioc_flags); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ksocknal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ksocknal_set_mynid (data->ioc_nid); + break; + } + case NAL_CMD_PUSH_CONNECTION: { + rc = ksocknal_push_sock (data->ioc_nid); + break; + } + } + + return rc; +} + +void +ksocknal_free_buffers (void) +{ + if (ksocknal_data.ksnd_fmbs != NULL) { + ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; + i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); + i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ksocknal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ksocknal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ksocknal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + + SOCKNAL_NNBLK_LTXS)); + + if (ksocknal_data.ksnd_schedulers != NULL) + PORTAL_FREE (ksocknal_data.ksnd_schedulers, + sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); +} + +void __exit +ksocknal_module_fini (void) +{ + int 
i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(SOCKNAL); + PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ksocknal_ni); + lib_fini(&ksocknal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); + LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = + &ksocknal_data.ksnd_schedulers[i]; + + LASSERT (list_empty (&kss->kss_tx_conns)); + LASSERT (list_empty (&kss->kss_rx_conns)); + LASSERT (kss->kss_nconns == 0); + } + + /* stop router calling me */ + kpr_shutdown (&ksocknal_data.ksnd_router); + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_reaper_waitq); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) + wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + + while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ksocknal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ksocknal_data.ksnd_router); + + ksocknal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + + +int __init 
+ksocknal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ksocknal_api.forward = ksocknal_api_forward; + ksocknal_api.shutdown = ksocknal_api_shutdown; + ksocknal_api.yield = ksocknal_api_yield; + ksocknal_api.validate = NULL; /* our api validate is a NOOP */ + ksocknal_api.lock = ksocknal_api_lock; + ksocknal_api.unlock = ksocknal_api_unlock; + ksocknal_api.nal_data = &ksocknal_data; + + ksocknal_lib.nal_data = &ksocknal_data; + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); + rwlock_init(&ksocknal_data.ksnd_socklist_lock); + + ksocknal_data.ksnd_nal_cb = &ksocknal_lib; + spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); + + spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, + sizeof (ksocknal_data.ksnd_irq_info)); + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + + PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + sizeof(ksock_sched_t) * 
SOCKNAL_N_SCHED); + if (ksocknal_data.ksnd_schedulers == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; + + spin_lock_init (&kss->kss_lock); + INIT_LIST_HEAD (&kss->kss_rx_conns); + INIT_LIST_HEAD (&kss->kss_tx_conns); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&kss->kss_zctxdone_list); +#endif + init_waitqueue_head (&kss->kss_waitq); + } + + CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, + sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS)); + if (ksocknal_data.ksnd_ltxs == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ksocknal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ksocknal_data.ksnd_idle_ltx_list : + &ksocknal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ksocknal_ni, ~0); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + rc = ksocknal_thread_start (ksocknal_scheduler, + &ksocknal_data.ksnd_schedulers[i]); + if (rc != 0) { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", + i, rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + + rc = ksocknal_thread_start (ksocknal_reaper, NULL); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ksocknal_data.ksnd_router, + &ksocknal_router_interface); + if (rc != 0) { + CDEBUG(D_NET, "Can't initialise routing interface " + "(rc = %d): not routing\n", rc); + } else { + /* Only allocate forwarding buffers if I'm on a gateway */ + + PORTAL_ALLOC(ksocknal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + if (ksocknal_data.ksnd_fmbs == NULL) { + ksocknal_module_fini (); + RETURN(-ENOMEM); + } + + /* NULL out buffer pointers etc */ + memset(ksocknal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS); i++) { + ksock_fmb_t *fmb = + &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i]; + + if (i < SOCKNAL_SMALL_FWD_NMSGS) { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; + } else { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) { + fmb->fmb_pages[j] = alloc_page (GFP_KERNEL); + + if 
(fmb->fmb_pages[j] == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + LASSERT(page_address (fmb->fmb_pages[j]) != + NULL); + } + + list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + } + + rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + ksocknal_module_fini (); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(ksocknal_ni); + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " + "mem %d)\n", + kpr_routing (&ksocknal_data.ksnd_router) ? + "enabled" : "disabled", pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); + +EXPORT_SYMBOL (ksocknal_ni); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 0000000..46ee3b7 --- /dev/null +++ b/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,293 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +typedef struct /* pool of forwarding buffers */ +{ + spinlock_t fmp_lock; /* serialise */ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + struct list_head kss_tx_conns; /* conn waiting to be written */ +#if 
SOCKNAL_ZC + struct list_head kss_zctxdone_list; /* completed ZC transmits */ +#endif + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + int kss_nconns; /* # connections assigned to this scheduler */ +} ksock_sched_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + ksock_sched_t *ksnd_schedulers; /* scheduler state */ + + kpr_router_t ksnd_router; /* THE router */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + spinlock_t ksnd_idle_ltx_lock; /* serialise ltx alloc/free */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ +#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ +#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments and 0 or more ptl_kiov_t 
fragments. Forwarded + * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 + * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 + * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t + * fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, if the message + * requires forwarding or will be received into mapped memory, up to + * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. + * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. + */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet iovec frags */ + struct iovec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + ptl_kiov_t *tx_kiov; /* packet page frags */ +#if SOCKNAL_ZC + ksock_sched_t *tx_sched; /* who to wake on callback */ + zccd_t tx_zccd; /* zero copy callback descriptor */ +#endif +} ksock_tx_t; + +#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the tx frag descriptors: hdr is always 1 iovec + * and payload is PTL_MD_MAX of either type. 
*/ +typedef struct +{ + struct iovec hdr; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } payload; +} ksock_txiovspace_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + ksock_txiovspace_t ltx_iov_space; /* where to stash frag descriptors */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or PTL_MD_MAX_IOV frags of payload of either type. 
*/ +typedef union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + volatile int ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct iovec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + volatile int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int 
ksocknal_add_sock (ptl_nid_t nid, int fd, int client); +extern int ksocknal_close_sock(ptl_nid_t nid); +extern int ksocknal_set_mynid(ptl_nid_t nid); +extern int ksocknal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid); +extern void _ksocknal_put_conn (ksock_conn_t *conn); +extern void ksocknal_close_conn (ksock_conn_t *conn); + +static inline void +ksocknal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ksocknal_put_conn (conn); +} + +extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_reaper (void *arg); +extern void ksocknal_data_ready(struct sock *sk, int n); +extern void ksocknal_write_space(struct sock *sk); + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000..388554d --- /dev/null +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,1612 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +atomic_t ksocknal_packets_received; +atomic_t ksocknal_packets_launched; +atomic_t ksocknal_packets_being_sent; + +#if SOCKNAL_ZC +int ksocknal_do_zc = 1; +int ksocknal_zc_min_frag = 2048; +#endif + +/* + * LIB functions follow + * + */ +int +ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ksocknal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ksocknal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +void +ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ksocknal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ksocknal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ksocknal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ksocknal_get_ltx (int may_block) +{ + long flags; + ksock_ltx_t *ltx = NULL; + + for (;;) { + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (<x->ltx_tx.tx_list); + break; + } + + if (!may_block) { + if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (<x->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock, + flags); + + wait_event (ksocknal_data.ksnd_idle_ltx_waitq, + !list_empty (&ksocknal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + return (ltx); +} + +#if SOCKNAL_ZC +struct page * +ksocknal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = 
vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (page == NULL || + !VALID_PAGE (page)) + return (NULL); + + return (page); +} +#endif + +int +ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +{ + struct iovec *iov = tx->tx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; +#if SOCKNAL_ZC + int offset = vaddr & (PAGE_SIZE - 1); + int zcsize = MIN (fragsize, PAGE_SIZE - offset); + struct page *page; +#endif + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (tx->tx_niov > 0); + more |= (tx->tx_niov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + zcsize >= ksocknal_zc_min_frag && + (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { + + CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", + (void *)vaddr, page, page_address(page), offset, zcsize); + + more |= (zcsize < fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, zcsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + /* NB don't pass tx's iov; sendmsg may or may not update it */ + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_iov++; + tx->tx_niov--; + return (1); +} + +int +ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +{ + ptl_kiov_t *kiov = tx->tx_kiov; + int fragsize = kiov->kiov_len; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_nkiov > 0); + more |= (tx->tx_nkiov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + fragsize >= ksocknal_zc_min_frag) { + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, fragsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + char *addr = ((char *)kmap (page)) + offset; + struct iovec fragiov = {.iov_base = addr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + kunmap (page); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_kiov++; + tx->tx_nkiov--; + return (1); +} + +int +ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +{ + int rc; + int sent_some = 0; + ENTRY; + + LASSERT (!in_interrupt()); + + for (;;) { + if (tx->tx_niov != 0) + rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + else + rc = ksocknal_send_kiov (sock, tx, more); + + /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ + if (rc <= 0) /* error or partial send */ + RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); + + if (tx->tx_nob == 0) /* sent everything */ + RETURN (0); + + sent_some = 1; + } +} + +int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct iovec *iov = conn->ksnc_rx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_iov++; + conn->ksnc_rx_niov--; + return (1); +} + +int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + unsigned long vaddr = ((unsigned long)kmap (page)) + offset; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + LASSERT (conn->ksnc_rx_nkiov > 0); + LASSERT (offset + fragsize <= PAGE_SIZE); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + kunmap (page); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_kiov++; + conn->ksnc_rx_nkiov--; + return (1); +} + +int +ksocknal_recvmsg (ksock_conn_t *conn) +{ + int rc; + int got_some = 0; + ENTRY; + + LASSERT (!in_interrupt ()); + + for (;;) { + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + /* CAVEAT EMPTOR: we return... + * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */ + + if (rc <= 0) /* error/EOF or partial receive */ + RETURN ((got_some || rc == -EAGAIN) ? 1 : rc); + + if (conn->ksnc_rx_nob_wanted == 0) + RETURN (1); + + got_some = 0; + } +} + +#if SOCKNAL_ZC +void +ksocknal_zc_callback (zccd_t *zcd) +{ + ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); + ksock_sched_t *sched = tx->tx_sched; + unsigned long flags; + ENTRY; + + /* Schedule tx for cleanup (can't do it now due to lock conflicts) */ + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); + EXIT; +} +#endif + +void +ksocknal_tx_done (ksock_tx_t *tx) +{ + long flags; + ksock_ltx_t *ltx; + ENTRY; + + atomic_dec (&ksocknal_packets_being_sent); + + if (tx->tx_isfwd) { /* was a forwarded packet? 
*/ + kpr_fwd_done (&ksocknal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + EXIT; + return; + } + + /* local send */ + ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + list_add_tail (<x->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && + waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) + wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + EXIT; +} + +void +ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + LASSERT (!list_empty (&sched->kss_tx_conns)); + conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + /* assume transmit will complete now, so dequeue while I've got lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ksocknal_sendmsg (conn->ksnc_sock, tx, + !list_empty (&conn->ksnc_tx_queue)); /* more to come? */ + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc != 0) { +#warning FIXME: handle socket errors properly + CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + /* kid on for now the whole packet went. + * NB when we handle the error better, we'll still need to + * block for zccd completion. 
+ */ + tx->tx_nob = 0; + } + + if (tx->tx_nob == 0) /* nothing left to send */ + { + /* everything went; assume more can go, so prevent write_space locking */ + conn->ksnc_tx_ready = 1; + + ksocknal_put_conn (conn); /* release packet's ref */ + atomic_inc (&ksocknal_packets_being_sent); +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + /* zccd skbufs are still in-flight. Release my + * initial ref on zccd, so callback can occur */ + zccd_put (&tx->tx_zccd); + } else +#endif + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + } else { + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); +} + +void +ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + unsigned long flags; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
+ */ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", + ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, + tx->tx_niov, tx->tx_nkiov); + +#if SOCKNAL_ZC + zccd_init (&tx->tx_zccd, ksocknal_zc_callback); + /* NB this sets 1 ref on zccd, so the callback can only occur + * after I've released this ref */ + tx->tx_sched = sched; +#endif + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + + atomic_inc (&ksocknal_packets_launched); +} + +ksock_conn_t * +ksocknal_send_target (ptl_nid_t nid) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + int rc; + + if ((conn = ksocknal_get_conn (nid)) == NULL) { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + return (NULL); + } + + if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { + CERROR ("Can't route to "LPX64": gateway "LPX64 + " is not a peer\n", nid, gatewaynid); + return (NULL); + } + } + + return (conn); +} + +ksock_ltx_t * +ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type) +{ + ksock_ltx_t *ltx; + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. 
*/ + ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) { + CERROR ("Can't allocate tx desc\n"); + return (NULL); + } + + /* Init local send packet (storage for hdr, finalize() args) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + /* Init common ltx_tx */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr); + + /* We always have 1 mapped frag for the header */ + ltx->ltx_tx.tx_niov = 1; + ltx->ltx_tx.tx_iov = <x->ltx_iov_space.hdr; + ltx->ltx_tx.tx_iov[0].iov_base = <x->ltx_hdr; + ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + ltx->ltx_tx.tx_kiov = NULL; + ltx->ltx_tx.tx_nkiov = 0; + + return (ltx); +} + +int +ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it until we can rely on it + * + * Also, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. 
+ */ + + CDEBUG(D_NET, + "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n", + payload_len, payload_niov, nid, pid); + + conn = ksocknal_send_target (nid); + if (conn == NULL) { + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); + if (ltx == NULL) { + ksocknal_put_conn (conn); + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + /* append the payload_iovs to the one pointing at the header */ + LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov, + payload_niov * sizeof (*payload_iov)); + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + + ksocknal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +int +ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. 
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        /* As ksocknal_send() notes, the return code of a send callback is
+         * ignored, so a message we cannot send must still be completed
+         * with lib_finalize(); otherwise it never completes. */
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);   /* drop ref from send_target */
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* payload pages go in the descriptor's kiov space; the header
+         * keeps the single mapped iov set up by ksocknal_setup_hdr() */
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/* Router callback: forward the packet described by 'fwd'.  The tx
+ * descriptor lives in the forward descriptor's scratch space.  If no
+ * connection to the next hop exists the forward is completed at once
+ * with -EHOSTUNREACH. */
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1; /* This is a forwarding packet */
+        tx->tx_nob = fwd->kprfd_nob;
+        tx->tx_niov = fwd->kprfd_niov;
+        tx->tx_iov = fwd->kprfd_iov;
+        tx->tx_nkiov = 0;
+        tx->tx_kiov = NULL;
+
+        ksocknal_launch_packet (conn, tx);
+}
+
+/* Start a kernel thread running fn(arg) and count it in ksnd_nthreads.
+ * Returns 0 on success or the negative value from kernel_thread(). */
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ksocknal_data.ksnd_nthreads);
+        return (0);
+}
+
+/* Drop one from the ksnd_nthreads count taken in ksocknal_thread_start(). */
+void
+ksocknal_thread_fini (void)
+{
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+}
+
+void
+ksocknal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ksock_fmb_pool_t *fmp = fmb->fmb_pool; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn = NULL; + ksock_sched_t *sched; + long flags; + + if (error != 0) + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid), + error); + else + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", + NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + + spin_lock_irqsave (&fmp->fmp_lock, flags); + + list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); + + if (!list_empty (&fmp->fmp_blocked_conns)) { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + } + + spin_unlock_irqrestore (&fmp->fmp_lock, flags); + + if (conn == NULL) + return; + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); +} + +ksock_fmb_t * +ksocknal_get_idle_fmb (ksock_conn_t *conn) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + long flags; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (ksocknal_data.ksnd_fmbs != NULL); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; + + spin_lock_irqsave (&pool->fmp_lock, flags); + + if (!list_empty (&pool->fmp_idle_fmbs)) { + fmb = list_entry(pool->fmp_idle_fmbs.next, + ksock_fmb_t, 
fmb_list); + list_del (&fmb->fmb_list); + spin_unlock_irqrestore (&pool->fmp_lock, flags); + + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + + spin_unlock_irqrestore (&pool->fmp_lock, flags); + return (NULL); +} + + +int +ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload start reading reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + /* copy header */ + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + + if (payload_nob == 0) { /* got complete packet already */ + atomic_inc (&ksocknal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, 1, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* forward it now */ + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + } else { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = + page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, niov, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / + sizeof (struct iovec)); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = + fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], + (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG 
(D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
+
+/* Examine the header just read on 'conn' for a packet that is not
+ * addressed to this node and decide what to do with it.
+ *
+ * The packet is dropped (by setting up to skip its body, if any) when the
+ * length is negative, when forwarding is not enabled (no fmbs), when the
+ * payload exceeds SOCKNAL_MAX_FWD_PAYLOAD, or when the destination is
+ * itself a direct peer.  Otherwise the connection moves to
+ * SOCKNAL_RX_GET_FMB with the payload size stashed for the forwarding
+ * buffer.
+ *
+ * NIDs in conn->ksnc_hdr are in wire byte order; everything below uses
+ * the NTOH__u64()-converted copies, both for logging and for the peer
+ * lookup. */
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid);
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                src_nid,
+                dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) { /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", src_nid,
+                       dest_nid, body_len);
+                ksocknal_new_packet (conn, 0); /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) { /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", src_nid,
+                       dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) { /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n", src_nid,
+                        dest_nid, body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct; look the peer up by its host-order
+         * nid, as every other ksocknal_get_conn() caller does */
+        conn2 = ksocknal_get_conn (dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n", src_nid,
+                        dest_nid);
+                ksocknal_put_conn (conn2); /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len; /* stash packet size */
+
conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + return (1); + } + + /* Set up to skip as much a possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_fmb_t *fmb; + int rc; + + /* NB: sched->ksnc_lock lock held */ + + LASSERT (!list_empty (&sched->kss_rx_conns)); + conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + CDEBUG(D_NET, "sched %p conn %p\n", sched, conn); + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT 
(conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* doesn't need a forwarding buffer */ + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) + goto try_read; + + get_fmb: + fmb = ksocknal_get_idle_fmb (conn); + if (fmb == NULL) { /* conn descheduled waiting for idle fmb */ + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + return; + } + + if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + rc = ksocknal_recvmsg(conn); + + if (rc == 0) + goto out; + if (rc < 0) { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read %p: %d\n", conn, rc); + goto out; + } + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + /* got all I wanted, assume there's more - prevent data_ready locking */ + conn->ksnc_rx_ready = 1; + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: + /* It's not for me */ + if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + ksocknal_fwd_parse (conn); + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + goto get_fmb; /* => go get a fwd msg buffer */ + default: + LBUG (); + } + /* Not Reached */ + } + + PROF_START(lib_parse); + /* sets wanted_len, iovs etc */ + lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + PROF_FINISH(lib_parse); + + if 
(conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ksocknal_packets_received); + /* packet is done now */ + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + NTOH__u64 (conn->ksnc_hdr.dest_nid), + conn->ksnc_rx_nob_left); + + atomic_inc (&ksocknal_packets_received); + + /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ + kpr_fwd_start (&ksocknal_data.ksnd_router, + (kpr_fwd_desc_t *)conn->ksnc_cookie); + + /* no slop in forwarded packets */ + LASSERT (conn->ksnc_rx_nob_left == 0); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* no data there to read? 
*/ + if (!conn->ksnc_rx_ready) { + /* let socket callback schedule again */ + conn->ksnc_rx_scheduled = 0; + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); +} + +int +ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int +ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int ksocknal_scheduler (void *arg) +{ + ksock_sched_t *sched = (ksock_sched_t *)arg; + unsigned long flags; + int rc; + int nloops = 0; + int id = sched - ksocknal_data.ksnd_schedulers; + char name[16]; +#if (CONFIG_SMP && CPU_AFFINITY) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + int cpu = 
cpu_logical_map(id % num_online_cpus()); +#else +#warning "Take care of architecure specific logical APIC map" + int cpu = 1; /* Have to change later. */ +#endif /* LINUX_VERSION_CODE */ + + set_cpus_allowed (current, 1 << cpu); + id = cpu; +#endif /* CONFIG_SMP && CPU_AFFINITY */ + + snprintf (name, sizeof (name),"ksocknald[%d]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&sched->kss_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + did_something = 1; + /* drops & regains kss_lock */ + ksocknal_process_receive (sched, &flags); + } + + if (!list_empty (&sched->kss_tx_conns)) { + did_something = 1; + /* drops and regains kss_lock */ + ksocknal_process_transmit (sched, &flags); + } +#if SOCKNAL_ZC + if (!list_empty (&sched->kss_zctxdone_list)) { + ksock_tx_t *tx = + list_entry(sched->kss_zctxdone_list.next, + ksock_tx_t, tx_list); + did_something = 1; + + list_del (&tx->tx_list); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, flags); + } +#endif + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_irqrestore (&sched->kss_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ +#if SOCKNAL_ZC + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns) || + !list_empty(&sched->kss_zctxdone_list)); +#else + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns)); +#endif + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&sched->kss_lock, flags); + } + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + ksocknal_thread_fini (); + return (0); +} + +void +ksocknal_data_ready (struct sock *sk, int n) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_sched_t *sched; + ENTRY; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->data_ready != &ksocknal_data_ready); + sk->data_ready (sk, n); + } else if (!conn->ksnc_rx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_rx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_receive may have cleared while I blocked for the lock) */ + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + EXIT; +} + +void +ksocknal_write_space (struct sock *sk) +{ + unsigned long flags; + ksock_conn_t *conn; + 
ksock_sched_t *sched; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->write_space != &ksocknal_write_space); + sk->write_space (sk); + } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + clear_bit (SOCK_NOSPACE, &sk->socket->flags); + + if (!conn->ksnc_tx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_tx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_transmit may have + cleared while I blocked for the lock) */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && // not being progressed + !list_empty(&conn->ksnc_tx_queue)){//packets to send + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); +} + +int +ksocknal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ksocknal_reaper"); + kportal_blockallsigs (); + + while (!ksocknal_data.ksnd_shuttingdown) { + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ksocknal_data.ksnd_reaper_list)) { + conn = NULL; + } else { + conn = list_entry 
(ksocknal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ksocknal_close_conn (conn); + else { + rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&ksocknal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ksocknal_thread_fini (); + return (0); +} + +nal_cb_t ksocknal_lib = { + nal_data: &ksocknal_data, /* NAL private data */ + cb_send: ksocknal_send, + cb_send_pages: ksocknal_send_pages, + cb_recv: ksocknal_recv, + cb_recv_pages: ksocknal_recv_pages, + cb_read: ksocknal_read, + cb_write: ksocknal_write, + cb_callback: ksocknal_callback, + cb_malloc: ksocknal_malloc, + cb_free: ksocknal_free, + cb_printf: ksocknal_printf, + cb_cli: ksocknal_cli, + cb_sti: ksocknal_sti, + cb_dist: ksocknal_dist +}; diff --git a/lnet/klnds/toelnd/Makefile.am b/lnet/klnds/toelnd/Makefile.am new file mode 100644 index 0000000..9bfff64 --- /dev/null +++ b/lnet/klnds/toelnd/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ktoenal +modulenet_DATA = ktoenal.o +EXTRA_PROGRAMS = ktoenal + +DEFS = +ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h diff --git a/lnet/klnds/toelnd/toenal.c b/lnet/klnds/toelnd/toenal.c new file mode 100644 index 0000000..178ea41 --- /dev/null +++ b/lnet/klnds/toelnd/toenal.c @@ -0,0 +1,629 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include +#include "toenal.h" + +ptl_handle_ni_t ktoenal_ni; +static nal_t ktoenal_api; +static ksock_nal_data_t ktoenal_data; + +/* +ksocknal_interface_t ktoenal_interface = { + ksni_add_sock: ktoenal_add_sock, + ksni_close_sock: ktoenal_close_sock, + ksni_set_mynid: ktoenal_set_mynid, +}; +*/ + +kpr_nal_interface_t ktoenal_router_interface = { + kprni_nalid: TOENAL, + kprni_arg: &ktoenal_data, + kprni_fwd: ktoenal_fwd_packet, +}; + + +int +ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */ + return PTL_OK; +} + +int +ktoenal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ktoenal_close_sock(0); /* close all sockets */ +} + +void +ktoenal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ktoenal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void 
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ktoenal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ktoenal_data.ksnd_mynid); + lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ktoenal_api); +} + +/* + * EXTRA functions follow + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ktoenal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ktoenal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. 
*/ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid); + + ktoenal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +int +ktoenal_add_sock (ptl_nid_t nid, int fd) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + int ret; + ENTRY; + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + file->f_flags |= O_NONBLOCK; /* Does this have any conflicts */ + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ktoenal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags); + + list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist); + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + ktoenal_data_ready(conn); + ktoenal_write_space(conn); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + /* Schedule pollthread so that it will poll + * for newly created socket + */ + + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ktoenal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave 
(&ktoenal_data.ksnd_socklist_lock, flags); + + if (nid == 0) /* close ALL connections */ + { + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ktoenal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ktoenal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + if (list_empty (&death_row)) + return (-ENOENT); + + do { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + ktoenal_put_conn (conn); /* drop ref for ksnd_socklist */ + } while (!list_empty (&death_row)); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + return (0); +} + + +ksock_conn_t * +ktoenal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ktoenal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, 
sizeof (*conn)); + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ktoenal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." */ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) + { + ktoenal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list); + wake_up (&ktoenal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); +} + +void +ktoenal_free_buffers (void) +{ + if (ktoenal_data.ksnd_fmbs != NULL) + { + ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ktoenal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ktoenal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); +} + +int +ktoenal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ktoenal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ktoenal_set_mynid (data->ioc_nid); + break; + } + } + + return rc; +} + + +void __exit +ktoenal_module_fini (void) +{ + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ktoenal_data.ksnd_init) + { + default: + 
LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(TOENAL); + PORTAL_SYMBOL_UNREGISTER (ktoenal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ktoenal_ni); + lib_fini(&ktoenal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ktoenal_data.ksnd_socklist)); + LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns)); + + kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */ + + /* flag threads to terminate; wake and wait for them to die */ + ktoenal_data.ksnd_shuttingdown = 1; + wake_up_all (&ktoenal_data.ksnd_reaper_waitq); + wake_up_all (&ktoenal_data.ksnd_sched_waitq); + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0) + { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ktoenal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ktoenal_data.ksnd_router); + + ktoenal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +int __init +ktoenal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ktoenal_api.forward = 
ktoenal_api_forward; + ktoenal_api.shutdown = ktoenal_api_shutdown; + ktoenal_api.yield = ktoenal_api_yield; + ktoenal_api.validate = NULL; /* our api validate is a NOOP */ + ktoenal_api.lock = ktoenal_api_lock; + ktoenal_api.unlock = ktoenal_api_unlock; + ktoenal_api.nal_data = &ktoenal_data; + + ktoenal_lib.nal_data = &ktoenal_data; + + memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist); + rwlock_init(&ktoenal_data.ksnd_socklist_lock); + + ktoenal_data.ksnd_nal_cb = &ktoenal_lib; + spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock); + + spin_lock_init (&ktoenal_data.ksnd_sched_lock); + + init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns); + INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list); + init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq); + spin_lock_init (&ktoenal_data.ksnd_reaper_lock); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */ + + PORTAL_ALLOC(ktoenal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + if (ktoenal_data.ksnd_fmbs == NULL) + RETURN(-ENOMEM); + + /* NULL out buffer pointers etc */ + memset(ktoenal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) + { + ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i]; + + if (i < 
SOCKNAL_SMALL_FWD_NMSGS) + { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp; + } + else + { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) + { + fmb->fmb_pages[j] = alloc_page (GFP_KERNEL); + + if (fmb->fmb_pages[j] == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + LASSERT (page_address (fmb->fmb_pages[j]) != NULL); + } + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + + PORTAL_ALLOC(ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + if (ktoenal_data.ksnd_ltxs == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ktoenal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) + { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ktoenal_data.ksnd_idle_ltx_list : + &ktoenal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni); + if (rc != 0) + { + CERROR("ktoenal: PtlNIInit failed: error %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ktoenal_ni, ~0); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */ + + ktoenal_data.ksnd_slistchange = 1; + for (i = 0; i < TOENAL_N_SCHED; i++) + { + rc = ktoenal_thread_start (ktoenal_scheduler, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); + ktoenal_module_fini (); + RETURN (rc); + } + } + + rc = ktoenal_thread_start (ktoenal_reaper, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = ktoenal_thread_start (ktoenal_pollthread, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal pollthread: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ktoenal_data.ksnd_router, + &ktoenal_router_interface); + if (rc != 0) + CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc); + + rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL); + if (rc != 0) + CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n", + rc); + + PORTAL_SYMBOL_REGISTER(ktoenal_ni); + + /* flag everything initialised */ + ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n", + kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ktoenal_module_init); +module_exit(ktoenal_module_fini); + +EXPORT_SYMBOL (ktoenal_ni); diff --git a/lnet/klnds/toelnd/toenal.h b/lnet/klnds/toelnd/toenal.h new file mode 100644 index 0000000..f793d3b --- /dev/null +++ b/lnet/klnds/toelnd/toenal.h @@ -0,0 +1,236 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 32 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +#define TOENAL_N_SCHED 1 + +typedef struct /* pool of forwarding buffers */ +{ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + + kpr_router_t ksnd_router; /* THE router */ + + spinlock_t ksnd_sched_lock; /* serialise packet scheduling */ + wait_queue_head_t ksnd_sched_waitq; /* where scheduler(s) wait */ + + struct list_head ksnd_rx_conns; /* conn waiting to 
be read */ + struct list_head ksnd_tx_conns; /* conn waiting to be written */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */ + poll_table ksnd_pwait; /* poll wait table for the socket */ + int ksnd_slistchange; /* informs the pollthread that + * the socklist has changed */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet frags */ + struct iovec *tx_iov; /* packet frags */ +} ksock_tx_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + struct iovec ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define 
KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as convenient macro for calculating a + * pointer to a struct from the addres of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* socket */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # frags */ + struct iovec ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */ + + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* 
packets waiting to be sent */ + unsigned long ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int ktoenal_add_sock (ptl_nid_t nid, int fd); +extern int ktoenal_close_sock(ptl_nid_t nid); +extern int ktoenal_set_mynid(ptl_nid_t nid); +extern int ktoenal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid); +extern void _ktoenal_put_conn (ksock_conn_t *conn); +extern void ktoenal_close_conn (ksock_conn_t *conn); + +static inline void +ktoenal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ktoenal_put_conn (conn); +} + +extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg); +extern int ktoenal_new_packet (ksock_conn_t *conn, int skip); +extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ktoenal_scheduler (void *arg); +extern int ktoenal_reaper (void *arg); +extern int ktoenal_pollthread (void *arg); +extern void ktoenal_data_ready(ksock_conn_t *conn); +extern void ktoenal_write_space(ksock_conn_t *conn); + + +extern nal_cb_t ktoenal_lib; +extern ksock_nal_data_t ktoenal_data; diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c new file mode 100644 index 0000000..8270196 --- /dev/null +++ b/lnet/klnds/toelnd/toenal_cb.c @@ -0,0 +1,1220 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include "toenal.h" + +atomic_t ktoenal_packets_received; +long ktoenal_packets_launched; +long ktoenal_packets_transmitted; + +/* + * LIB functions follow + * + */ +int +ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ktoenal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ktoenal_free(nal_cb_t *nal, void *buf, size_t len) +{ + 
PORTAL_FREE(buf, len); +} + +void +ktoenal_printf(nal_cb_t *nal, const char *fmt, ...) +{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ktoenal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ktoenal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ktoenal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ktoenal_get_ltx (int may_block) +{ + long flags; + ksock_ltx_t *ltx = NULL; + + for (;;) + { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + break; + } + + if (!may_block) + { + if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + wait_event (ktoenal_data.ksnd_idle_ltx_waitq, + !list_empty (&ktoenal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + return (ltx); +} + +int +ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags) +{ + /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't) + */ + mm_segment_t oldmm; + int rc; + + LASSERT (niov > 0); + LASSERT (nob >
0); + + oldmm = get_fs(); + set_fs (KERNEL_DS); + +#ifdef PORTAL_DEBUG + { + int total_nob; + int i; + + for (i = total_nob = 0; i < niov; i++) + total_nob += iov[i].iov_len; + + LASSERT (nob == total_nob); + } +#endif + LASSERT (!in_interrupt()); + + rc = sock->f_op->writev(sock, iov, niov, NULL); + + set_fs (oldmm); + + if (rc > 0) /* sent something? */ + { + nob = rc; /* consume iov */ + for (;;) + { + LASSERT (niov > 0); + + if (iov->iov_len >= nob) + { + iov->iov_len -= nob; + iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); + break; + } + nob -= iov->iov_len; + iov->iov_len = 0; + iov++; + niov--; + } + } + + return (rc); +} + +int +ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread) +{ + /* NB This procedure "consumes" iov (actually tcp_recvmsg does) + */ + mm_segment_t oldmm; + int ret, i, len = 0, origlen = 0; + + PROF_START(our_recvmsg); + for(i = 0; i < niov; i++) { + len += iov[i].iov_len; + if(len >= toread) + break; + } + + if(len >= toread) { + origlen = iov[i].iov_len; + iov[i].iov_len -= (len - toread); + } + else { /* i == niov */ + i = niov - 1; + } + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + ret = sock->f_op->readv(sock, iov, i + 1, NULL); + + set_fs(oldmm); + + if(origlen) + iov[i].iov_len = origlen; + + PROF_FINISH(our_recvmsg); + return ret; +} + +void +ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags) +{ + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + int rc; + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + + /* assume transmit will complete now, so dequeue while I've got the lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0; /* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ktoenal_sendmsg (conn->ksnc_file, + 
tx->tx_iov, tx->tx_niov, tx->tx_nob, + list_empty (&conn->ksnc_tx_queue) ? + MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE)); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc < 0) /* error */ + { + if (rc == -EAGAIN) /* socket full => */ + rc = 0; /* nothing sent */ + else + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + rc = tx->tx_nob; /* kid on for now whole packet went */ + } + } + + if (rc == tx->tx_nob) /* everything went */ + { + conn->ksnc_tx_ready = 1; /* assume more can go (ASAP) */ + ktoenal_put_conn (conn); /* release packet's ref */ + + if (tx->tx_isfwd) /* was a forwarded packet? */ + { + kpr_fwd_done (&ktoenal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + } + else /* local send */ + { + ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list && + waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq)) + wake_up (&ktoenal_data.ksnd_idle_ltx_waitq); + } + ktoenal_packets_transmitted++; + } + else + { + tx->tx_nob -= rc; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) /* nothing to write */ + { + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); +} + +void +ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + long flags; + int nob
= tx->tx_nob; + struct iovec *iov = tx->tx_iov; + int niov = 1; + + LASSERT (nob >= sizeof (ptl_hdr_t)); + + /* Truncate iov to exactly match total packet length + * since socket sendmsg pays no attention to requested length. + */ + for (;;) + { + LASSERT (niov <= tx->tx_niov); + LASSERT (iov->iov_len >= 0); + + if (iov->iov_len >= nob) + { + iov->iov_len = nob; + break; + } + nob -= iov->iov_len; + iov++; + niov++; + } + tx->tx_niov = niov; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) /* not scheduled to send */ + { + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + ktoenal_packets_launched++; + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +int +ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + ksock_ltx_t *ltx; + int rc; + int i; + + /* By this point, as it happens, we have absolutely no idea what + * 'private' is. It might be ksock_nal_data or it might be ksock_conn. + * Ha ha, isn't that a funny joke? + * + * FIXME: this is not the right way to fix this; the right way is to + * always pass in the same kind of structure. This is hard right now. + * To revisit this issue, set a breakpoint in here and watch for when + * it's called from lib_finalize. I think this occurs when we send a + * packet as a side-effect of another packet, such as when an ACK has + * been requested. -phil */ + + CDEBUG(D_NET, "sending "LPSZ" bytes from [%d](%p,%d)... 
to nid: "LPX64" pid %d\n", + payload_len, payload_niov, + payload_niov > 0 ? payload_iov[0].iov_base : NULL, + payload_niov > 0 ? payload_iov[0].iov_len : 0, + nid, pid); + + if ((conn = ktoenal_get_conn (nid)) == NULL) + { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) + { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (-1); + } + + if ((conn = ktoenal_get_conn (gatewaynid)) == NULL) + { + CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", + nid, gatewaynid); + return (-1); + } + } + + /* This transmit has now got a ref on conn */ + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) + { + CERROR ("Can't allocate tx desc\n"); + ktoenal_put_conn (conn); + return (-1); + } + + /* Init common (to sends and forwards) packet part */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_iov = ltx->ltx_iov; + + /* Init local send packet (storage for hdr, finalize() args, iov) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr; + ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + for (i = 0; i < payload_niov; i++) + { + ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base; + ltx->ltx_iov[1 + i].iov_len = payload_iov[i].iov_len; + } + + ktoenal_launch_packet (conn, &ltx->ltx_tx); + return (0); +} + +void +ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + fwd->kprfd_gateway_nid,
fwd->kprfd_target_nid); + + if (nid == ktoenal_lib.ni.nid) /* I'm the gateway; must be the last hop */ + nid = fwd->kprfd_target_nid; + + conn = ktoenal_get_conn (nid); + if (conn == NULL) + { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + + ktoenal_launch_packet (conn, tx); +} + +int +ktoenal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ktoenal_data.ksnd_nthreads); + return (0); +} + +void +ktoenal_thread_fini (void) +{ + atomic_dec (&ktoenal_data.ksnd_nthreads); +} + +void +ktoenal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn; + long flags; + + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + if (error != 0) + CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + + if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns)) + { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore 
(&ktoenal_data.ksnd_sched_lock, flags); +} + +ksock_fmb_t * +ktoenal_get_idle_fmb (ksock_conn_t *conn) +{ + /* NB called with sched lock held */ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ktoenal_data.ksnd_small_fmp; + else + pool = &ktoenal_data.ksnd_large_fmp; + + if (!list_empty (&pool->fmp_idle_fmbs)) + { + fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); + list_del (&fmb->fmb_list); + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + return (NULL); +} + + +int +ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload, start reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload.
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */ + + if (payload_nob == 0) /* got complete packet already */ + { + atomic_inc (&ktoenal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, 1, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + else + { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do + { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, niov, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", 
conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob); + return (0); +} + +void +ktoenal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + int body_len; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + switch (conn->ksnc_hdr.type) + { + case PTL_MSG_GET: + case PTL_MSG_ACK: + body_len = 0; + break; + case PTL_MSG_PUT: + body_len = conn->ksnc_hdr.msg.put.length; + break; + case PTL_MSG_REPLY: + body_len = conn->ksnc_hdr.msg.reply.length; + break; + default: + /* Unrecognised packet type */ + CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n", + conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + /* Ignore this header and go back to reading a new packet. */ + ktoenal_new_packet (conn, 0); + return; + } + + if (body_len < 0) /* length corrupt */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) /* too big to forward */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, body_len); /* on to new packet (skip this one's body) */ + return; + } + + conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */ + if (conn2 != NULL) + { + CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + ktoenal_put_conn (conn2); /* drop ref from get above */ + + ktoenal_new_packet (conn, body_len); /* on to next packet (skip this one's body) */ + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ 
+ conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ktoenal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) /* right at next packet boundary now */ + { + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + return (1); + } + + /* set up to skip as much as possible now */ + /* if there's more left (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + skipped = 0; + niov = 0; + + do + { + nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags) +{ + ksock_fmb_t *fmb; + int len; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* NB: sched lock held */ + CDEBUG(D_NET, "conn %p\n", conn); + + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) /* doesn't need a forwarding buffer */ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto try_read; + } + + get_fmb: + /* NB: sched lock held */ + fmb = ktoenal_get_idle_fmb (conn); + if (fmb == NULL) /* conn descheduled waiting for idle fmb */ + return; + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock,
*irq_flags); + + if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0; /* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + /* NB ktoenal_recvmsg "consumes" the iov passed to it */ + len = ktoenal_recvmsg(conn->ksnc_file, + conn->ksnc_rx_iov, conn->ksnc_rx_niov, + conn->ksnc_rx_nob_wanted); + CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len); + + if (len <= 0) /* nothing ready (EAGAIN) or EOF or error */ + { + if (len != -EAGAIN && /* ! nothing to read now */ + len != 0) /* ! nothing to read ever */ + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read(%d) %p: %d\n", + conn->ksnc_rx_nob_wanted, conn, len); + } + goto out; /* come back when there's data ready */ + } + + LASSERT (len <= conn->ksnc_rx_nob_wanted); + conn->ksnc_rx_nob_wanted -= len; + conn->ksnc_rx_nob_left -= len; + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + conn->ksnc_rx_ready = 1; /* assume there's more to be had */ + + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: + if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */ + { + ktoenal_fwd_parse (conn); + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping this packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto get_fmb; /* => go get a fwd msg buffer */ 
+ default: + } + /* Not Reached */ + LBUG (); + } + + PROF_START(lib_parse); + lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */ + PROF_FINISH(lib_parse); + + if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */ + { + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ktoenal_packets_received); + lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */ + /* Fall through */ + + case SOCKNAL_RX_SLOP: + if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */ + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + atomic_inc (&ktoenal_packets_received); + + /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */ + kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie); + + LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + if (!conn->ksnc_rx_ready) /* no data there to read? 
*/ + { + conn->ksnc_rx_scheduled = 0; /* let socket callback schedule again */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); +} + +int +ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + int i; + + conn->ksnc_cookie = msg; + + LASSERT (niov <= PTL_MD_MAX_IOV); + for (i = 0; i < niov; i++) + { + conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len; + conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base; + } + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + return (rlen); +} + +int +ktoenal_scheduler (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + int nloops = 0; + + kportal_daemonize ("ktoenal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + while (!ktoenal_data.ksnd_shuttingdown) + { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&ktoenal_data.ksnd_rx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */ + } + + if (!list_empty (&ktoenal_data.ksnd_tx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + + list_del (&conn->ksnc_tx_list); + ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */ + } + + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) /* hogging CPU? 
*/ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty (&ktoenal_data.ksnd_rx_conns) || + !list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + ktoenal_thread_fini (); + return (0); +} + + +int +ktoenal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ktoenal_reaper"); + kportal_blockallsigs (); + + while (!ktoenal_data.ksnd_shuttingdown) + { + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ktoenal_data.ksnd_reaper_list)) + conn = NULL; + else + { + conn = list_entry (ktoenal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ktoenal_close_conn (conn); + else { + rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty(&ktoenal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ktoenal_thread_fini (); + return (0); +} + +#define POLLREAD (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI) +#define POLLWRITE (POLLOUT | POLLWRNORM | POLLWRBAND) + +int +ktoenal_pollthread(void *arg) +{ + unsigned int mask; + struct list_head *tmp; + ksock_conn_t *conn; + + /* Save the task struct for waking it up */ + ktoenal_data.ksnd_pollthread_tsk = current; + + kportal_daemonize ("ktoenal_pollthread"); + kportal_blockallsigs (); + + poll_initwait(&ktoenal_data.ksnd_pwait); + + while(!ktoenal_data.ksnd_shuttingdown) { + + set_current_state(TASK_INTERRUPTIBLE); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + list_for_each(tmp, 
&ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc(&conn->ksnc_refcount); + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + mask = conn->ksnc_file->f_op->poll(conn->ksnc_file, + ktoenal_data.ksnd_slistchange ? + &ktoenal_data.ksnd_pwait : NULL); + + if(mask & POLLREAD) { + ktoenal_data_ready(conn); + + } + if (mask & POLLWRITE) { + ktoenal_write_space(conn); + + } + if (mask & (POLLERR | POLLHUP)) { + /* Do error processing */ + } + + read_lock (&ktoenal_data.ksnd_socklist_lock); + if(atomic_dec_and_test(&conn->ksnc_refcount)) + _ktoenal_put_conn(conn); + } + ktoenal_data.ksnd_slistchange = 0; + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + if(ktoenal_data.ksnd_slistchange) { + poll_freewait(&ktoenal_data.ksnd_pwait); + poll_initwait(&ktoenal_data.ksnd_pwait); + } + } + poll_freewait(&ktoenal_data.ksnd_pwait); + ktoenal_thread_fini(); + return (0); +} + +void +ktoenal_data_ready (ksock_conn_t *conn) +{ + unsigned long flags; + ENTRY; + + if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail (&conn->ksnc_rx_list, + &ktoenal_data.ksnd_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + /* This is done to avoid the effects of a sequence + * of events in which the rx_ready is lost + */ + conn->ksnc_rx_ready=1; + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } + + EXIT; +} + +void +ktoenal_write_space (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "conn %p%s%s%s\n", + conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? 
" scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued")); + + + if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */ + !conn->ksnc_tx_scheduled) { /* not being progressed */ + + list_add_tail (&conn->ksnc_tx_list, + &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } +} + +nal_cb_t ktoenal_lib = { + nal_data: &ktoenal_data, /* NAL private data */ + cb_send: ktoenal_send, + cb_recv: ktoenal_recv, + cb_read: ktoenal_read, + cb_write: ktoenal_write, + cb_callback: ktoenal_callback, + cb_malloc: ktoenal_malloc, + cb_free: ktoenal_free, + cb_printf: ktoenal_printf, + cb_cli: ktoenal_cli, + cb_sti: ktoenal_sti, + cb_dist: ktoenal_dist +}; diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am new file mode 100644 index 0000000..e2e11af --- /dev/null +++ b/lnet/libcfs/Makefile.am @@ -0,0 +1,29 @@ +# Copyright (C) 2001, 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +MODULE = portals +modulenet_DATA = portals.o +EXTRA_PROGRAMS = portals + +LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-not-impl.c lib-pid.c +APILINKS := api-eq.c api-errno.c api-init.c api-md.c api-me.c api-ni.c api-wrap.c +LINKS = $(APILINKS) $(LIBLINKS) +DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej + +$(LINKS): link-stamp +link-stamp: + -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + echo timestamp > link-stamp + +DEFS = +portals_SOURCES = $(LINKS) module.c proc.c debug.c + +# Don't distribute any patched files. +dist-hook: + list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done + +include ../Rules.linux diff --git a/lnet/libcfs/Makefile.mk b/lnet/libcfs/Makefile.mk new file mode 100644 index 0000000..3196ea2 --- /dev/null +++ b/lnet/libcfs/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include fs/lustre/portals/Kernelenv + +obj-y += libcfs.o +libcfs-objs := module.o proc.o debug.o \ No newline at end of file diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c new file mode 100644 index 0000000..6233b8d --- /dev/null +++ b/lnet/libcfs/debug.c @@ -0,0 +1,821 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include + +#define DEBUG_OVERFLOW 1024 +static char *debug_buf = NULL; +static unsigned long debug_size = 0; +static atomic_t debug_off_a = ATOMIC_INIT(0); +static int debug_wrapped; +wait_queue_head_t debug_ctlwq; +#define DAEMON_SND_SIZE (64 << 10) + +/* + * used by the daemon to keep track the offset into debug_buffer for the next + * write to the file. Usually, the daemon is to write out buffer + * from debug_daemon_next_write upto debug_off + * variable usage + * Reader - portals_debug_msg() + * Writer - portals_debug_daemon() + * portals_debug_daemon_start() during daemon init time + * portals_debug_daemon_continue() to reset to debug_off + * portals_debug_clear_buffer() reset to debug_off for clear + * Note that *_start(), *_continue() & *clear_buffer() should serialized; + */ +static atomic_t debug_daemon_next_write; + +/* + * A debug_daemon can be in following states + * stopped - stopped state means there is no debug_daemon running. 
+ * accordingly, it must be in paused state + * a daemon is in !stopped && !paused state after + * "lctl debug_daemon start" creates debug_daemon successfully + * Variable Usage + * Reader - portals_debug_daemon() + * portals_debug_set_daemon() routines + * Writer - portals_debug_set_daemon() routines + * portals_debug_daemon() on IO error + * paused - a debug_daemon state is changed from !paused into paused + * when "lctl debug_daemon paused" is issued + * "lctl debug_daemon continue" gets a daemon into !paused mode + * Reader - portals_debug_set_daemon() routines + * portals_debug_msg() + * Writer - portals_debug_set_daemon() on init + * portals_debug_daemon() + * + * Daemon state diagram. + * (stopped, paused) + * | <-- debug_daemon start + * V + * (!stopped, !paused) + * | <-- debug_daemon pause + * V + * (!stopped, paused) + * | <-- debug_daemon continue + * V + * (!stopped, !paused) + * | <-- debug_daemon stop + * V + * (stopped, paused) + * Overlapped - this is a state when CDEBUG is too fast for the daemon to + * write out the debug_bufferr. That is, debug_off is to + * overlap debug_daemon_next_write; + * Reader - portals_debug_msg() + * Writer - portals_debug_msg() + */ + +/* + * Description on Trace Daemon Synchronization + * + * Three categories of code are synchronizing between each other + * 1. lctl, portals_debug_set_daemon(), the user debug control code, + * as well as portals_debug_clear_buffer() + * 2. CDEBUG, portals_debug_msg(), the debug put messages routine + * 3. Daemon, portals_debug_daemon(), to write out debug log file + * + * + * Three different controls for synchronizations + * + * 1. debug_daemon_semaphore + * The usage of this semaphore is to serialize multiple lctl controls + * in manipulating debug daemon state. 
The semaphore serves as the + * gatekeeper to allow only one user control thread, at any giving time, + * to access debug daemon state and keeps the other user control requests + * in wait state until the current control request is serviced. + * + * 2. wait_queue_head_t lctl (paired with lctl_event flag) + * Lctl event is the event between portals_debug_set_daemon() and + * portals_debug_daemon(). Lctl is an indicator for portals_debug_daemon() + * to flush data out to file. portals_debug_daemon() is to use lctl event + * as signal channel to wakeup portals_debug_set_daemon() upon flush + * operation is done. + * + * Producer : + * portals_debug_daemon() uses to wake up + * portals_debug_set_daemon(), pause and stop, routines + * Consumer : + * portals_debug_set_daemon(), stop and pause operations, + * wait and sleep on the event + * + * 3. wait_queue_head_t daemon (paired with daemon_event flag) + * This is an event channel to wakeup portals_debug_daemon. Daemon + * wakes up to run whenever there is an event posted. Daemon handles + * 2 types of operations . 1. Writes data out to debug file, 2. Flushes + * file and terminates base on lctl event. + * File operation - + * Daemon is normally in a sleep state. + * Daemon is woken up through daemon event whenever CDEBUG is + * putting data over any 64K boundary. + * File flush and termination - + * On portals_debug_daemon_stop/pause() operations, lctl control + * is to wake up daemon through daemon event. + * + * We can't use sleep_on() and wake_up() to replace daemon event because + * portals_debug_daemon() must catch the wakeup operation posted by + * portals_debug_daemon_stop/pause(). Otherwise, stop and pause may + * stuck in lctl wait event. + * + * Producer : + * a. portals_debug_daemon_pause() and portals_debug_daemon_stop() + * uses the event to wake up portals_debug_daemon() + * b. 
portals_debug_msg() uses the event to wake up + * portals_debug_daemon() whenever the data output is acrossing + * a 64K bytes boundary. + * Consumer : + * portals_debug_daemon() wakes up upon daemon event. + * + * Sequence for portals_debug_daemon_stop() operation + * + * _Portals_debug_daemon_stop()_ _Daemon_ + * Wait_event(daemon) or running + * Paused = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Set force_flush flag if lctlevnt + * Flush data + * Wakeup_event (lctl) + * Wait_event(daemon) + * Stopped = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Exit daemon loop if (Stopped) + * Wakeup_event (lctl) + * Exit + * Return to user application + * + * + * _Portals_debug_msg()_ _Daemon_ + * Wait_event(daemon) or running + * If (WriteStart<64Kjournal_info; + current->journal_info = NULL; + sprintf(debug_file_name, "%s.%ld", debug_file_path, CURRENT_TIME); + file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for dumping", debug_file_name); + GOTO(out, PTR_ERR(file)); + } else { + printk(KERN_ALERT "dumping log to %s ... 
writing ...\n", + debug_file_name); + } + + debug_off = atomic_read(&debug_off_a); + oldfs = get_fs(); + set_fs(get_ds()); + if (debug_wrapped) { + rc = file->f_op->write(file, debug_buf + debug_off + 1, + debug_size-debug_off-1, &file->f_pos); + rc += file->f_op->write(file, debug_buf, debug_off + 1, + &file->f_pos); + } else { + rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos); + } + printk("wrote %d bytes\n", rc); + set_fs(oldfs); + + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc) + CERROR("sync returns %d\n", rc); + filp_close(file, 0); +out: + current->journal_info = journal_info; + wake_up(&debug_ctlwq); + return 0; +} + +int portals_debug_daemon(void *arg) +{ + struct file *file; + void *journal_info; + mm_segment_t oldfs; + unsigned long force_flush = 0; + unsigned long size; + int rc; + + kportal_daemonize("ldebug_daemon"); + reparent_to_init(); + journal_info = current->journal_info; + current->journal_info = NULL; + + file = filp_open(debug_daemon_file_path, + O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for logging", debug_daemon_file_path); + GOTO(out1, PTR_ERR(file)); + } else { + printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n", + debug_daemon_file_path); + } + + debug_daemon_state.overlapped = 0; + debug_daemon_state.stopped = 0; + atomic_set(&debug_daemon_state.paused, 0); + oldfs = get_fs(); + set_fs(KERNEL_DS); + while (1) { + unsigned long ending; + unsigned long start, tail; + long delta; + + debug_daemon_state.daemon_event = 0; + + ending = atomic_read(&debug_off_a); + start = atomic_read(&debug_daemon_next_write); + + /* check if paused is imposed by lctl ? */ + force_flush = !debug_daemon_state.lctl_event; + + delta = ending - start; + tail = debug_size - start; + size = (delta >= 0) ? 
delta : tail; + while (size && (force_flush || (delta < 0) || + (size >= DAEMON_SND_SIZE))) { + if (daemon_file_size_limit) { + int ssize = daemon_file_size_limit - file->f_pos; + if (size > ssize) + size = ssize; + } + + rc = file->f_op->write(file, debug_buf+start, + size, &file->f_pos); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon write error %d\n", rc); + goto out; + } + start += rc; + delta = ending - start; + tail = debug_size - start; + if (tail == 0) + start = 0; + if (delta >= 0) + size = delta; + else + size = (tail == 0) ? ending : tail; + if (daemon_file_size_limit == file->f_pos) { + // file wrapped around + file->f_pos = 0; + } + } + atomic_set(&debug_daemon_next_write, start); + if (force_flush) { + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon sync error %d\n", rc); + goto out; + } + if (debug_daemon_state.stopped) + break; + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + } + wait_event(debug_daemon_state.daemon, + debug_daemon_state.daemon_event); + } +out: + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + set_fs(oldfs); + filp_close(file, 0); + current->journal_info = journal_info; +out1: + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + return 0; +} + +void portals_debug_print(void) +{ + unsigned long dumplen = 64 * 1024; + char *start1, *start2; + char *end1, *end2; + unsigned long debug_off = atomic_read(&debug_off_a); + + start1 = debug_buf + debug_off - dumplen; + if (start1 < debug_buf) { + start1 += debug_size; + end1 = debug_buf + debug_size - 1; + start2 = debug_buf; + end2 = debug_buf + debug_off; + } else { + end1 = debug_buf + debug_off; + start2 = debug_buf + debug_off; + end2 = debug_buf + debug_off; + } + + while (start1 < end1) { + int count = MIN(1024, end1 - start1); + printk("%*s", count, start1); + start1 += 1024; + } + while (start2 < end2) { + int count = MIN(1024, end2 - start2); + 
printk("%*s", count, start2); + start2 += 1024; + } +} + +void portals_debug_dumplog(void) +{ + int rc; + ENTRY; + + init_waitqueue_head(&debug_ctlwq); + + rc = kernel_thread(portals_do_debug_dumplog, + NULL, CLONE_VM | CLONE_FS | CLONE_FILES); + if (rc < 0) { + printk(KERN_ERR "cannot start dump thread\n"); + return; + } + sleep_on(&debug_ctlwq); +} + +int portals_debug_daemon_start(char *file, unsigned int size) +{ + int rc; + + if (!debug_daemon_state.stopped) + return -EALREADY; + + if (file != NULL) + strncpy(debug_daemon_file_path, file, 1024); + + init_waitqueue_head(&debug_daemon_state.lctl); + init_waitqueue_head(&debug_daemon_state.daemon); + + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + + daemon_file_size_limit = size << 20; + + debug_daemon_state.lctl_event = 0; + rc = kernel_thread(portals_debug_daemon, NULL, 0); + if (rc < 0) { + printk(KERN_ERR "cannot start debug daemon thread\n"); + strncpy(debug_daemon_file_path, "\0", 1); + return rc; + } + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_pause(void) +{ + if (atomic_read(&debug_daemon_state.paused)) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.lctl_event = 0; + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_continue(void) +{ + if (!atomic_read(&debug_daemon_state.paused)) + return -EINVAL; + if (debug_daemon_state.stopped) + return -EINVAL; + + debug_daemon_state.overlapped = 0; + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + atomic_set(&debug_daemon_state.paused, 0); + return 0; +} + +int portals_debug_daemon_stop(void) +{ + if (debug_daemon_state.stopped) + return -EALREADY; + + if (!atomic_read(&debug_daemon_state.paused)) + portals_debug_daemon_pause(); + + debug_daemon_state.lctl_event = 0; + 
debug_daemon_state.stopped = 1; + + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + + debug_daemon_file_path[0] = '\0'; + return 0; +} + +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *filename, unsigned int size) +{ + int rc = -EINVAL; + + down(&debug_daemon_semaphore); + switch (cmd) { + case DEBUG_DAEMON_START: + if (length && (filename[length -1] != '\0')) { + CERROR("Invalid filename for debug_daemon\n"); + rc = -EINVAL; + break; + } + rc = portals_debug_daemon_start(filename, size); + break; + case DEBUG_DAEMON_STOP: + rc = portals_debug_daemon_stop(); + break; + case DEBUG_DAEMON_PAUSE: + rc = portals_debug_daemon_pause(); + break; + case DEBUG_DAEMON_CONTINUE: + rc = portals_debug_daemon_continue(); + break; + default: + CERROR("unknown set_daemon cmd\n"); + } + up(&debug_daemon_semaphore); + return rc; +} + +static int panic_dumplog(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (handled_panic) + return 0; + else + handled_panic = 1; + + if (in_interrupt()) { + portals_debug_print(); + return 0; + } + + while (current->lock_depth >= 0) + unlock_kernel(); + portals_debug_dumplog(); + return 0; +} + +static struct notifier_block lustre_panic_notifier = { + notifier_call : panic_dumplog, + next : NULL, + priority : 10000 +}; + +/* Allocate and zero the debug buffer (bufsize bytes plus DEBUG_OVERFLOW + * slack), mark the debug daemon stopped and paused, and register the panic + * notifier. Returns 0, -EALREADY if a buffer already exists, or -ENOMEM. 
*/ +int portals_debug_init(unsigned long bufsize) +{ + unsigned long debug_off = atomic_read(&debug_off_a); + if (debug_buf != NULL) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + + debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW); + if (debug_buf == NULL) + return -ENOMEM; + /* debug_size is only assigned at the end of this function, so it + * cannot be used to size the clear; zero the whole allocation */ + memset(debug_buf, 0, bufsize + DEBUG_OVERFLOW); + debug_wrapped = 0; + + printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n", + bufsize, debug_buf); + atomic_set(&debug_off_a, debug_off); + notifier_chain_register(&panic_notifier_list, 
&lustre_panic_notifier); + debug_size = bufsize; + + return 0; +} + +/* Unregister the panic notifier, stop the debug daemon and free the debug + * buffer. Returns 0, or -EINVAL when no buffer is allocated. */ +int portals_debug_cleanup(void) +{ + notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + portals_debug_daemon_stop(); + + vfree(debug_buf); + /* NULL marks the buffer as absent for the debug_buf == NULL checks + * in this file, so a freed buffer is never written or freed again */ + debug_buf = NULL; + atomic_set(&debug_off_a, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +int portals_debug_clear_buffer(void) +{ + unsigned long flags; + unsigned long state; + + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + state = atomic_read(&debug_daemon_state.paused); + if (!state) + portals_debug_daemon_pause(); + spin_lock_irqsave(&portals_debug_lock, flags); + atomic_set(&debug_off_a, 0); + debug_wrapped = 0; + atomic_set(&debug_daemon_next_write, 0); + debug_daemon_state.overlapped = 0; + spin_unlock_irqrestore(&portals_debug_lock, flags); + + if (!state) + atomic_set(&debug_daemon_state.paused, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +/* Debug markers, although printed by S_PORTALS + * should not be marked as such. 
+ */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int portals_debug_mark_buffer(char *text) +{ + if (debug_buf == NULL) + return -EINVAL; + + CDEBUG(0, "*******************************************************************************\n"); + CDEBUG(0, "DEBUG MARKER: %s\n", text); + CDEBUG(0, "*******************************************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_PORTALS + +__s32 portals_debug_copy_to_user(char *buf, unsigned long len) +{ + int rc; + unsigned long debug_off; + unsigned long flags; + + if (len < debug_size) + return -ENOSPC; + + debug_off = atomic_read(&debug_off_a); + spin_lock_irqsave(&portals_debug_lock, flags); + if (debug_wrapped) { + /* All of this juggling with the 1s is to keep the trailing nul + * (which falls at debug_buf + debug_off) at the end of what we + * copy into user space */ + copy_to_user(buf, debug_buf + debug_off + 1, + debug_size - debug_off - 1); + copy_to_user(buf + debug_size - debug_off - 1, + debug_buf, debug_off + 1); + rc = debug_size; + } else { + copy_to_user(buf, debug_buf, debug_off); + rc = debug_off; + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + + return rc; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) +{ + va_list ap; + unsigned long flags; + int max_nob; + int prefix_nob; + int msg_nob; + struct timeval tv; + unsigned long base_offset; + unsigned long debug_off; + + if (debug_buf == NULL) { + printk("portals_debug_msg: debug_buf is NULL!\n"); + return; + } + + spin_lock_irqsave(&portals_debug_lock, flags); + debug_off = atomic_read(&debug_off_a); + if (!atomic_read(&debug_daemon_state.paused)) { + unsigned long available; + long delta; + long v = atomic_read(&debug_daemon_next_write); + + delta = debug_off - v; + available = (delta>=0) ? 
debug_size-delta : -delta; + // Check if we still have enough debug buffer for CDEBUG + if (available < DAEMON_SND_SIZE) { + /* Drop CDEBUG packets until enough debug_buffer is + * available */ + if (debug_daemon_state.overlapped) + goto out; + /* If this is the first time, leave a marker in the + * output */ + debug_daemon_state.overlapped = 1; + ap = NULL; + format = "DEBUG MARKER: Debug buffer overlapped\n"; + } else /* More space just became available */ + debug_daemon_state.overlapped = 0; + } + + max_nob = debug_size - debug_off + DEBUG_OVERFLOW; + if (max_nob <= 0) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + printk("logic error in portals_debug_msg: <0 bytes to write\n"); + return; + } + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. + */ + + do_gettimeofday(&tv); + + prefix_nob = snprintf(debug_buf + debug_off, max_nob, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id(), + tv.tv_sec, tv.tv_usec); + max_nob -= prefix_nob; + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.extern_pid, stack); +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.mode.tt.extern_pid, stack); +#else + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d+%lu): ", + file, line, fn, current->pid, stack); +#endif + max_nob -= msg_nob; + + va_start(ap, format); + msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); + + /* Print to console, while msg is 
contiguous in debug_buf */ + /* NB safely terminated see above */ + if ((mask & D_EMERG) != 0) + printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob); + if ((mask & D_ERROR) != 0) + printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob); + else if (portal_printk) + printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob); + base_offset = debug_off & 0xFFFF; + + debug_off += prefix_nob + msg_nob; + if (debug_off > debug_size) { + memcpy(debug_buf, debug_buf + debug_size, + debug_off - debug_size + 1); + debug_off -= debug_size; + debug_wrapped = 1; + } + + atomic_set(&debug_off_a, debug_off); + if (!atomic_read(&debug_daemon_state.paused) && + ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) { + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + } +out: + spin_unlock_irqrestore(&portals_debug_lock, flags); +} + +void portals_debug_set_level(unsigned int debug_level) +{ + printk("Setting portals debug level to %08x\n", debug_level); + portal_debug = debug_level; +} + +void portals_run_lbug_upcall(char * file, char *fn, int line) +{ + char *argv[6]; + char *envp[3]; + char buf[32]; + int rc; + + ENTRY; + snprintf (buf, sizeof buf, "%d", line); + + argv[0] = portals_upcall; + argv[1] = "LBUG"; + argv[2] = file; + argv[3] = fn; + argv[4] = buf; + argv[5] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " + "/proc/sys/portals/upcall\n", + argv[0], argv[1], argv[2], argv[3], argv[4], rc); + + } else { + CERROR("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); + } +} + + +EXPORT_SYMBOL(portals_debug_dumplog); +EXPORT_SYMBOL(portals_debug_msg); +EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c new file mode 100644 index 
0000000..1b9e5bb --- /dev/null +++ b/lnet/libcfs/module.c @@ -0,0 +1,572 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define PORTAL_MINOR 240 + +extern void (kping_client)(struct portal_ioctl_data *); + +struct nal_cmd_handler { + nal_cmd_handler_t nch_handler; + void * nch_private; +}; + +static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +struct semaphore nal_cmd_sem; + +#ifdef PORTAL_DEBUG +void +kportal_assertion_failed (char *expr, char *file, char *func, int line) +{ + unsigned long stack = CDEBUG_STACK(stack); + portals_debug_msg(0, D_EMERG, file, func, line, stack, + "ASSERTION(%s) failed\n", expr); + LBUG(); +} +#endif + +void +kportal_daemonize (char *str) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) + daemonize(str); +#else + daemonize(); + snprintf (current->comm, sizeof (current->comm), "%s", str); +#endif +} + +void 
+kportal_blockallsigs () +{ + unsigned long flags; + + /* siginitsetinv(..., 0) fills the blocked mask, i.e. blocks every signal */ + spin_lock_irqsave (&current->sigmask_lock, flags); + siginitsetinv (&current->blocked, 0); + recalc_sigpending (current); + spin_unlock_irqrestore (&current->sigmask_lock, flags); +} + +/* called when opening /dev/device */ +static int kportal_psdev_open(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + PORTAL_MODULE_USE; + RETURN(0); +} + +/* called when closing /dev/device */ +static int kportal_psdev_release(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + + PORTAL_MODULE_UNUSE; + RETURN(0); +} + +static inline void freedata(void *data, int len) +{ + PORTAL_FREE(data, len); +} + +static int +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_del_route(ptl_nid_t target) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_del_route (target); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) +{ + int gateway_nalid; + ptl_nid_t gateway_nid; + ptl_nid_t lo_nid; + ptl_nid_t hi_nid; + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, + &hi_nid); 
+ + if (rc == 0) { + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", + index, 
gateway_nalid, gateway_nid, lo_nid, hi_nid); + + *gateway_nalidp = (__u32)gateway_nalid; + /* NIDs are returned through ptl_nid_t pointers; store them + * unmodified rather than narrowing them to 32 bits */ + *gateway_nidp = gateway_nid; + *lo_nidp = lo_nid; + *hi_nidp = hi_nid; + } + + PORTAL_SYMBOL_PUT (kpr_control_interface); + return (rc); +} + +static int +kportal_nal_cmd(int nal, struct portal_ioctl_data *data) +{ + int rc = -EINVAL; + + ENTRY; + + down(&nal_cmd_sem); + if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { + CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd); + rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private); + } + up(&nal_cmd_sem); + RETURN(rc); +} + +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + return (PORTAL_SYMBOL_GET(kqswnal_ni)); + case SOCKNAL: + return (PORTAL_SYMBOL_GET(ksocknal_ni)); + case TOENAL: + return (PORTAL_SYMBOL_GET(ktoenal_ni)); + case GMNAL: + return (PORTAL_SYMBOL_GET(kgmnal_ni)); + case TCPNAL: + /* userspace NAL */ + return (NULL); + case SCIMACNAL: + return (PORTAL_SYMBOL_GET(kscimacnal_ni)); + default: + /* A warning to a naive caller */ + CERROR ("unknown nal: %d\n", nal); + return (NULL); + } +} + +void +kportal_put_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + PORTAL_SYMBOL_PUT(kqswnal_ni); + break; + case SOCKNAL: + PORTAL_SYMBOL_PUT(ksocknal_ni); + break; + case TOENAL: + PORTAL_SYMBOL_PUT(ktoenal_ni); + break; + case GMNAL: + PORTAL_SYMBOL_PUT(kgmnal_ni); + break; + case TCPNAL: + /* A lesson to a malicious caller */ + LBUG (); + case SCIMACNAL: + PORTAL_SYMBOL_PUT(kscimacnal_ni); + break; + default: + CERROR ("unknown nal: %d\n", nal); + } +} + +int +kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + if (nal_cmd[nal].nch_handler != NULL) + 
rc = -EBUSY; + else { + nal_cmd[nal].nch_handler = handler; + nal_cmd[nal].nch_private = private; + 
} + up(&nal_cmd_sem); + } + return rc; +} + +int +kportal_nal_unregister(int nal) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + nal_cmd[nal].nch_handler = NULL; + nal_cmd[nal].nch_private = NULL; + up(&nal_cmd_sem); + } + return rc; +} + + +static int kportal_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + char buf[1024]; + struct portal_ioctl_data *data; + + ENTRY; + + if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || + _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || + _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + RETURN(-EINVAL); + } + + if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + RETURN(-EINVAL); + } + + data = (struct portal_ioctl_data *)buf; + + switch (cmd) { + case IOC_PORTAL_SET_DAEMON: + RETURN (portals_debug_set_daemon ( + (unsigned int) data->ioc_count, + (unsigned int) data->ioc_inllen1, + (char *) data->ioc_inlbuf1, + (unsigned int) data->ioc_misc)); + case IOC_PORTAL_GET_DEBUG: { + __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1, + data->ioc_plen1); + + if (size < 0) + RETURN(size); + + data->ioc_size = size; + err = copy_to_user((char *)arg, data, sizeof(*data)); + RETURN(err); + } + case IOC_PORTAL_CLEAR_DEBUG: + portals_debug_clear_buffer(); + RETURN(0); + case IOC_PORTAL_PANIC: + if (!capable (CAP_SYS_BOOT)) + RETURN (-EPERM); + panic("debugctl-invoked panic"); + RETURN(0); + case IOC_PORTAL_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + RETURN(-EINVAL); + portals_debug_mark_buffer(data->ioc_inlbuf1); + RETURN(0); + case IOC_PORTAL_PING: { + void (*ping)(struct portal_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n", + data->ioc_count, data->ioc_nid); + ping = 
PORTAL_SYMBOL_GET(kping_client); + if (!ping) + CERROR("PORTAL_SYMBOL_GET failed\n"); + else { + ping(data); + PORTAL_SYMBOL_PUT(kping_client); + } + RETURN(0); + } + + case IOC_PORTAL_ADD_ROUTE: + CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, data->ioc_nid2, + data->ioc_nid3); + err = kportal_add_route(data->ioc_nal, data->ioc_nid, + MIN (data->ioc_nid2, data->ioc_nid3), + MAX (data->ioc_nid2, data->ioc_nid3)); + break; + + case IOC_PORTAL_DEL_ROUTE: + CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); + err = kportal_del_route (data->ioc_nid); + break; + + case IOC_PORTAL_GET_ROUTE: + CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); + err = kportal_get_route(data->ioc_count, &data->ioc_nal, + &data->ioc_nid, &data->ioc_nid2, + &data->ioc_nid3); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_GET_NID: { + const ptl_handle_ni_t *nip; + ptl_process_id_t pid; + + CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlGetId (*nip, &pid); + LASSERT (err == PTL_OK); + kportal_put_ni (data->ioc_nal); + + data->ioc_nid = pid.nid; + if (copy_to_user ((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + } + + case IOC_PORTAL_NAL_CMD: + CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal, + data->ioc_nal_cmd); + err = kportal_nal_cmd(data->ioc_nal, data); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_FAIL_NID: { + const ptl_handle_ni_t *nip; + + CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", + data->ioc_nal, data->ioc_nid, data->ioc_count); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + return (-EINVAL); + + err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count); + break; + } + + default: + err = -EINVAL; + break; + } + + 
RETURN(err); +} + + +static struct file_operations portalsdev_fops = { + ioctl: kportal_ioctl, + open: kportal_psdev_open, + release: kportal_psdev_release +}; + + +static struct miscdevice portal_dev = { + PORTAL_MINOR, + "portals", + &portalsdev_fops +}; + +extern int insert_proc(void); +extern void remove_proc(void); +MODULE_AUTHOR("Peter J. Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +static int init_kportals_module(void) +{ + int rc; + + rc = portals_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "portals_debug_init: %d\n", rc); + return (rc); + } + + sema_init(&nal_cmd_sem, 1); + + rc = misc_register(&portal_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_debug; + } + + rc = PtlInit(); + if (rc) { + CERROR("PtlInit: error %d\n", rc); + goto cleanup_deregister; + } + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_fini; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return (0); + + cleanup_fini: + PtlFini(); + cleanup_deregister: + misc_deregister(&portal_dev); + cleanup_debug: + portals_debug_cleanup(); + return rc; +} + +static void exit_kportals_module(void) +{ + int rc; + + remove_proc(); + PtlFini(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&portal_kmemory)); + + + rc = misc_deregister(&portal_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + if (atomic_read(&portal_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&portal_kmemory)); + + rc = portals_debug_cleanup(); + if (rc) + printk(KERN_ERR "portals_debug_cleanup: %d\n", rc); +} + +EXPORT_SYMBOL(lib_dispatch); +EXPORT_SYMBOL(PtlMEAttach); +EXPORT_SYMBOL(PtlMEInsert); +EXPORT_SYMBOL(PtlMEUnlink); +EXPORT_SYMBOL(PtlEQAlloc); +EXPORT_SYMBOL(PtlMDAttach); +EXPORT_SYMBOL(PtlMDUnlink); +EXPORT_SYMBOL(PtlNIInit); +EXPORT_SYMBOL(PtlNIFini); +EXPORT_SYMBOL(PtlNIDebug); +EXPORT_SYMBOL(PtlInit); +EXPORT_SYMBOL(PtlFini); 
+EXPORT_SYMBOL(PtlPut); +EXPORT_SYMBOL(PtlGet); +EXPORT_SYMBOL(ptl_err_str); +EXPORT_SYMBOL(portal_subsystem_debug); +EXPORT_SYMBOL(portal_debug); +EXPORT_SYMBOL(portal_stack); +EXPORT_SYMBOL(portal_printk); +EXPORT_SYMBOL(PtlEQWait); +EXPORT_SYMBOL(PtlEQFree); +EXPORT_SYMBOL(PtlEQGet); +EXPORT_SYMBOL(PtlGetId); +EXPORT_SYMBOL(PtlMDBind); +EXPORT_SYMBOL(lib_iov_nob); +EXPORT_SYMBOL(lib_copy_iov2buf); +EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_kiov_nob); +EXPORT_SYMBOL(lib_copy_kiov2buf); +EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_finalize); +EXPORT_SYMBOL(lib_parse); +EXPORT_SYMBOL(lib_init); +EXPORT_SYMBOL(lib_fini); +EXPORT_SYMBOL(portal_kmemory); +EXPORT_SYMBOL(kportal_daemonize); +EXPORT_SYMBOL(kportal_blockallsigs); +EXPORT_SYMBOL(kportal_nal_register); +EXPORT_SYMBOL(kportal_nal_unregister); +EXPORT_SYMBOL(kportal_assertion_failed); +EXPORT_SYMBOL(dispatch_name); +EXPORT_SYMBOL(kportal_get_ni); +EXPORT_SYMBOL(kportal_put_ni); + +module_init(init_kportals_module); +module_exit (exit_kportals_module); diff --git a/lnet/libcfs/proc.c b/lnet/libcfs/proc.c new file mode 100644 index 0000000..2fa739a --- /dev/null +++ b/lnet/libcfs/proc.c @@ -0,0 +1,290 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + +static struct ctl_table_header *portals_table_header = NULL; +extern char debug_file_path[1024]; +extern char debug_daemon_file_path[1024]; +extern char portals_upcall[1024]; + +#define PSDEV_PORTALS (0x100) +#define PSDEV_DEBUG 1 /* control debugging */ +#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */ +#define PSDEV_PRINTK 3 /* force all errors to console */ +#define PSDEV_DEBUG_PATH 4 /* crashdump log location */ +#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */ +#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */ + +#define PORTALS_PRIMARY_CTLCNT 6 +static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { + {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, + sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, + {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path, + sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring, + &sysctl_string}, + {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, + sizeof(portals_upcall), 0644, NULL, &proc_dostring, + &sysctl_string}, + {0} +}; + +static struct ctl_table top_table[2] = { + {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, + {0} +}; + + +#ifdef 
PORTALS_PROFILING +/* + * profiling stuff. we do this statically for now 'cause its simple, + * but we could do some tricks with elf sections to have this array + * automatically built. + */ +#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } + +struct prof_ent prof_ents[] = { + def_prof(our_recvmsg), + def_prof(our_sendmsg), + def_prof(socknal_recv), + def_prof(lib_parse), + def_prof(conn_list_walk), + def_prof(memcpy), + def_prof(lib_finalize), + def_prof(pingcli_time), + def_prof(gmnal_send), + def_prof(gmnal_recv), +}; + +EXPORT_SYMBOL(prof_ents); + +/* + * this function is as crazy as the proc filling api + * requires. + * + * buffer: page allocated for us to scribble in. the + * data returned to the user will be taken from here. + * *start: address of the pointer that will tell the + * caller where in buffer the data the user wants is. + * ppos: offset in the entire /proc file that the user + * currently wants. + * wanted: the amount of data the user wants. + * + * while going, 'curpos' is the offset in the entire + * file where we currently are. We only actually + * start filling buffer when we get to a place in + * the file that the user cares about. + * + * we take care to only sprintf when the user cares because + * we're holding a lock while we do this. + * + * we're smart and know that we generate fixed size lines. + * we only start writing to the buffer when the user cares. + * This is unpredictable because we don't snapshot the + * list between calls that are filling in a file from + * the list. The list could change mid read and the + * output will look very weird indeed. oh well. 
+ */
+
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                /* copy only what is left of the header after ppos, so a
+                 * read starting mid-header cannot run off its end */
+                int diff = MIN(header_len - (int)ppos, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for ( i = 0; i < MAX_PROFS ; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len)) {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                /* average cycles per completed interval; 0 if none yet */
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+/*
+ * Register the /proc profiling entry (only with PORTALS_PROFILING) and
+ * the "portals" sysctl table (only with CONFIG_SYSCTL).
+ * Returns 0 on success, -1 if the profiling table is out of sync with
+ * MAX_PROFS or the "cycles" proc entry cannot be created.
+ */
+int insert_proc(void)
+{
+/* #ifdef, to match the guard around prof_ents/prof_read_proc/basedir */
+#ifdef PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /*
+         * This is pretty lame. assuming that failure just
+         * means that they already existed.
+         */
+        /* start from an empty string: strcat() on an uninitialized
+         * stack buffer appends after whatever garbage it finds */
+        dir[0] = '\0';
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+/*
+ * Undo insert_proc(): remove the "cycles" entry and its directory, then
+ * unregister the sysctl table if it was registered.
+ */
+void remove_proc(void)
+{
+#ifdef PORTALS_PROFILING
+        unsigned char dir[128];
+        int end;
+
+        dir[0]='\0';
+        strcat(dir, basedir);
+
+        /* remember where the directory name ends so the "/cycles"
+         * suffix can be chopped off again below */
+        end = strlen(dir);
+
+        strcat(dir, "/cycles");
+        remove_proc_entry(dir,0);
+
+        dir[end] = '\0';
+        remove_proc_entry(dir,0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
diff --git a/lnet/lnet/Makefile.am b/lnet/lnet/Makefile.am
new file mode 100644
index 0000000..9fb7f6f
--- /dev/null
+++ b/lnet/lnet/Makefile.am
@@ -0,0 +1,10 @@
+# Copyright (C) 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
+lib_LIBRARIES= libportals.a
+libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-md.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-not-impl.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
diff --git a/lnet/lnet/Makefile.mk b/lnet/lnet/Makefile.mk
new file mode 100644
index 0000000..5627ef7
--- /dev/null
+++ b/lnet/lnet/Makefile.mk
@@ -0,0 +1,9 @@
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c
new file mode 100644
index 0000000..57427f6
--- /dev/null
+++ b/lnet/lnet/api-eq.c
@@ -0,0 +1,161 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * PtlMDUpdate is here so that it can access the per-eventq
+ * structures.
+ */
+
+#include
+
+int ptl_eq_init(void)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_fini(void)
+{
+        /* Nothing to do anymore... */
+}
+
+int ptl_eq_ni_init(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_ni_fini(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+}
+
+/*
+ * Non-blocking fetch of the next event from 'eventq' into '*ev'.
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_EQ when the handle does
+ * not resolve to a NAL, PTL_EQ_EMPTY when the slot at the current
+ * sequence number has not been written yet, PTL_EQ_DROPPED when the slot
+ * holds a different sequence number than expected (the event is still
+ * returned), and PTL_OK otherwise.  The queue is examined and advanced
+ * under the NAL lock.
+ */
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+
+        /* ensure event is delivered correctly despite possible
+           races with lib_finalize */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+/*
+ * Poll PtlEQGet() until it returns anything other than PTL_EQ_EMPTY,
+ * calling the NAL's yield hook (if any) between attempts.
+ */
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        int rc;
+
+        /* PtlEQGet does the handle checking */
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                /* PTL_EQ_EMPTY is only returned after the handle
+                 * resolved to a NAL, so 'nal' is non-NULL here */
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+
+        return rc;
+}
+
+#ifndef __KERNEL__
+/* sigjmp_buf: the SIGALRM handler jumps out of PtlEQWait(), and the
+ * signal mask must be restored or SIGALRM stays blocked afterwards */
+static sigjmp_buf eq_jumpbuf;
+
+static void eq_timeout(int signal)
+{
+        siglongjmp(eq_jumpbuf, -1);
+}
+
+/*
+ * PtlEQWait() bounded by 'timeout' seconds, implemented with SIGALRM.
+ * Returns PTL_EQ_EMPTY if the alarm fires first, otherwise the
+ * PtlEQWait() result.  The caller's SIGALRM handler is restored and any
+ * alarm it had pending is re-armed with the time it has left.
+ * Uses file-scope/static state, so it is not reentrant.
+ */
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        /* static: these are read after siglongjmp() returns here, where
+         * modified automatic variables would be indeterminate */
+        static void (*prev) (int);
+        static int left_over;
+        static int armed;
+        time_t time_at_start;
+        int elapsed;
+        int rc;
+
+        if (sigsetjmp(eq_jumpbuf, 1)) {
+                /* the alarm fired after 'armed' seconds */
+                signal(SIGALRM, prev);
+                alarm(left_over > armed ? left_over - armed : 0);
+                return PTL_EQ_EMPTY;
+        }
+
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        time_at_start = time(NULL);
+        armed = timeout;
+        /* honour an earlier pending alarm, but left_over == 0 means
+         * "none pending" - alarm(0) would cancel our own timeout */
+        if (left_over != 0 && left_over < timeout) {
+                armed = left_over;
+                alarm(left_over);
+        }
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        signal(SIGALRM, prev);
+        elapsed = (int)(time(NULL) - time_at_start);
+        if (left_over == 0)
+                alarm(0);       /* nothing was pending: just cancel ours */
+        else                    /* re-arm with what is left, at least 1s */
+                alarm(left_over > elapsed ? left_over - elapsed : 1);
+
+        return rc;
+}
+
+#endif
+
diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c
new file mode 100644
index 0000000..5cb0980
--- /dev/null
+++ b/lnet/lnet/api-errno.c
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */ + +#include + +/* If you change these, you must update the number table in portals/errno.h */ +const char *ptl_err_str[] = { + "PTL_OK", + "PTL_SEGV", + + "PTL_NOSPACE", + "PTL_INUSE", + "PTL_VAL_FAILED", + + "PTL_NAL_FAILED", + "PTL_NOINIT", + "PTL_INIT_DUP", + "PTL_INIT_INV", + "PTL_AC_INV_INDEX", + + "PTL_INV_ASIZE", + "PTL_INV_HANDLE", + "PTL_INV_MD", + "PTL_INV_ME", + "PTL_INV_NI", +/* If you change these, you must update the number table in portals/errno.h */ + "PTL_ILL_MD", + "PTL_INV_PROC", + "PTL_INV_PSIZE", + "PTL_INV_PTINDEX", + "PTL_INV_REG", + + "PTL_INV_SR_INDX", + "PTL_ML_TOOLONG", + "PTL_ADDR_UNKNOWN", + "PTL_INV_EQ", + "PTL_EQ_DROPPED", + + "PTL_EQ_EMPTY", + "PTL_NOUPDATE", + "PTL_FAIL", + "PTL_NOT_IMPLEMENTED", + "PTL_NO_ACK", + + "PTL_IOV_TOO_MANY", + "PTL_IOV_TOO_SMALL", + + "PTL_EQ_INUSE", + "PTL_MD_INUSE" +}; +/* If you change these, you must update the number table in portals/errno.h */ diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c new file mode 100644 index 0000000..b54f684 --- /dev/null +++ b/lnet/lnet/api-init.c @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-init.c + * Initialization and global data for the p30 user side library + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * All handles have their interface number stored in the second 16 bit word + */ + +#include + +int ptl_init; +unsigned int portal_subsystem_debug = 0xfff7e3ff; +unsigned int portal_debug = ~0; +unsigned int portal_printk; +unsigned int portal_stack; + +#ifdef __KERNEL__ +atomic_t portal_kmemory = ATOMIC_INIT(0); +#endif + +int __p30_initialized; +int __p30_myr_initialized; +int __p30_ip_initialized; +ptl_handle_ni_t __myr_ni_handle; +ptl_handle_ni_t __ip_ni_handle; + +int __p30_myr_timeout = 10; +int __p30_ip_timeout; + +int PtlInit(void) +{ + + if (ptl_init) + return PTL_OK; + + ptl_ni_init(); + ptl_me_init(); + ptl_eq_init(); + ptl_init = 1; + __p30_initialized = 1; + + return PTL_OK; +} + + +void PtlFini(void) +{ + + /* Reverse order of initialization */ + ptl_eq_fini(); + ptl_me_fini(); + ptl_ni_fini(); + ptl_init = 0; +} diff --git a/lnet/lnet/api-md.c b/lnet/lnet/api-md.c new file mode 100644 index 0000000..967112f --- /dev/null +++ b/lnet/lnet/api-md.c @@ -0,0 +1,9 @@ +/* + * api-p30/md.c + * + * Memory descriptor functions that need address validation + * There are a few standing issues... + * - Addresses are invalidated by the library without telling us. + */ +#include + diff --git a/lnet/lnet/api-me.c b/lnet/lnet/api-me.c new file mode 100644 index 0000000..573e948 --- /dev/null +++ b/lnet/lnet/api-me.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-me.c + * Match Entry local operations. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +int ptl_me_init(void) +{ + return PTL_OK; +} +void ptl_me_fini(void) +{ /* Nothing to do */ +} +int ptl_me_ni_init(nal_t * nal) +{ + return PTL_OK; +} + +void ptl_me_ni_fini(nal_t * nal) +{ /* Nothing to do... */ +} diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c new file mode 100644 index 0000000..952da4f --- /dev/null +++ b/lnet/lnet/api-ni.c @@ -0,0 +1,184 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-ni.c + * Network Interface code + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include
+
+#define MAX_NIS 8
+static nal_t *ptl_interfaces[MAX_NIS];
+/* number of occupied slots in ptl_interfaces[] */
+int ptl_num_interfaces = 0;
+
+/*
+ * Map a handle to the NAL stored in the slot named by its nal_idx.
+ * Returns NULL when the index is out of range or the slot is empty.
+ */
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int idx = handle->nal_idx;
+
+        /* XXX we really rely on the caller NOT racing with interface
+         * setup/teardown. That ensures her NI handle can't get
+         * invalidated out from under her (or worse, swapped for a
+         * completely different interface!) */
+
+        if (idx < MAX_NIS)
+                return ptl_interfaces[idx];
+
+        return NULL;
+}
+
+/* Empty the interface table.  Always returns PTL_OK. */
+int ptl_ni_init(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        return PTL_OK;
+}
+
+/*
+ * Shut down every interface still in the table and leave the table
+ * empty, so no stale NAL pointer or count survives into a later
+ * PtlInit().
+ */
+void ptl_ni_fini(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++) {
+                nal_t *nal = ptl_interfaces[i];
+                if (!nal)
+                        continue;
+
+                if (nal->shutdown)
+                        nal->shutdown(nal, i);
+
+                ptl_interfaces[i] = NULL;
+        }
+
+        ptl_num_interfaces = 0;
+}
+
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_ni_init_mutex);
+
+static void ptl_ni_init_mutex_enter (void)
+{
+        down (&ptl_ni_init_mutex);
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+        up (&ptl_ni_init_mutex);
+}
+
+#else
+/* userspace build: interface setup/teardown is not serialized */
+static void ptl_ni_init_mutex_enter (void)
+{
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+}
+
+#endif
+
+/*
+ * Bring up (or re-reference) a network interface.
+ *
+ * 'interface' is called with the index of the first free table slot and
+ * returns the NAL.  If that NAL is already in the table its refcount is
+ * bumped and the existing index is returned in '*handle'; otherwise it
+ * is stored in the free slot.
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_NAL_FAILED if 'interface'
+ * returns NULL, PTL_NOSPACE if the table is full (the new NAL is shut
+ * down again), PTL_OK otherwise.
+ */
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int slot;
+        int i;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        /* PtlNIFini() can free a slot in the middle of the table, so
+         * look for the first empty one instead of trusting the count;
+         * slot == MAX_NIS means the table is full */
+        for (slot = 0; slot < MAX_NIS; slot++)
+                if (ptl_interfaces[slot] == NULL)
+                        break;
+
+        nal = interface(slot, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        /* scan every slot: live entries may sit above a freed one */
+        for (i = 0; i < MAX_NIS; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        if (slot >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, slot);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        handle->nal_idx = slot;
+        ptl_interfaces[slot] = nal;
+        ptl_num_interfaces++;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+/*
+ * Drop one reference on the interface named by 'ni'; the last reference
+ * shuts the NAL down and frees its table slot.
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if 'ni' does not
+ * name a live interface, otherwise PTL_OK or the NAL's shutdown result.
+ */
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int rc;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_INV_HANDLE;
+        }
+
+        nal->refct--;
+        if (nal->refct > 0) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_OK;
+        }
+
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        rc = PTL_OK;
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, ni.nal_idx);
+
+        ptl_interfaces[ni.nal_idx] = NULL;
+        ptl_num_interfaces--;
+
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+/* Copy 'handle_in' to '*ni_out' unchanged.  Always returns PTL_OK. */
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
+{
+        *ni_out = handle_in;
+
+        return PTL_OK;
+}
diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c
new file mode 100644
index 0000000..cbd4d1f
--- /dev/null
+++ b/lnet/lnet/api-wrap.c
@@ -0,0 +1,601 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Assumes the handle encodes the network number in the second 16 bit word
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include
+
+/*
+ * Common path of every wrapper below: resolve 'any_h' to its NAL and let
+ * the NAL's forward hook carry 'cmd' with its argument/return buffers.
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if the handle
+ * names no NAL, PTL_OK once the request has been forwarded.  The
+ * operation's own status is in the return buffer's 'rc' field.
+ */
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                /* shared by all wrappers, so don't blame PtlGetId */
+                fprintf(stderr, "do_forward: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
+
+/* Forward PTL_GETID; on success store the process id in '*id' if non-NULL. */
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in args;
+        PtlGetId_out ret;
+        int rc;
+
+        args.handle_in = ni_handle;
+
+        rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;
+
+        if (id)
+                *id = ret.id_out;
+
+        return ret.rc;
+}
+
+/* Forward PTL_FAILNID with the given nid and threshold. */
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold)
+{
+        PtlFailNid_in args;
+        PtlFailNid_out ret;
+        int rc;
+
+        args.interface = interface;
+        args.nid = nid;
+        args.threshold = threshold;
+
+        rc = do_forward (interface, PTL_FAILNID,
+                         &args, sizeof(args), &ret, sizeof (ret));
+
+        return ((rc != PTL_OK) ? rc : ret.rc);
+}
+
+/* Forward PTL_NISTATUS; on success store the register value in
+ * '*status_out' if non-NULL. */
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in args;
+        PtlNIStatus_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.register_in = register_in;
+
+        rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (status_out)
+                *status_out = ret.status_out;
+
+        return ret.rc;
+}
+
+/* Forward PTL_NIDIST; on success store the distance in '*distance_out'
+ * if non-NULL. */
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in args;
+        PtlNIDist_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.process_in = process_in;
+
+        rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (distance_out)
+                *distance_out = ret.distance_out;
+
+        return ret.rc;
+}
+
+
+
+/* Forward PTL_NIDEBUG with the new mask.  Returns the forwarded 'rc'
+ * field, or do_forward()'s error code if forwarding failed. */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in args;
+        PtlNIDebug_out ret;
+        int rc;
+
+        args.mask_in = mask_in;
+
+        rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+/* Forward PTL_MEATTACH.  On success '*handle_out' (if non-NULL) gets the
+ * returned cookie plus the NAL index of 'interface_in'. */
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in args;
+        PtlMEAttach_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = pos_in;
+
+        rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = interface_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+
+        return ret.rc;
+}
+ +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out) +{ + PtlMEInsert_in args; + PtlMEInsert_out ret; + int rc; + + args.current_in = current_in; + args.match_id_in = match_id_in; + args.match_bits_in = match_bits_in; + args.ignore_bits_in = ignore_bits_in; + args.unlink_in = unlink_in; + args.position_in = position_in; + + rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = current_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMEUnlink(ptl_handle_me_t current_in) +{ + PtlMEUnlink_in args; + PtlMEUnlink_out ret; + int rc; + + args.current_in = current_in; + args.unlink_in = PTL_RETAIN; + + rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + return ret.rc; +} + +int PtlTblDump(ptl_handle_ni_t ni, int index_in) +{ + PtlTblDump_in args; + PtlTblDump_out ret; + int rc; + + args.index_in = index_in; + + rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + return ret.rc; +} + +int PtlMEDump(ptl_handle_me_t current_in) +{ + PtlMEDump_in args; + PtlMEDump_out ret; + int rc; + + args.current_in = current_in; + + rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + return ret.rc; +} + +static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) +{ + nal_t *nal; + int rc; + int i; + + if (!ptl_init) { + fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n"); + return PTL_NOINIT; + } + + nal = ptl_hndl2nal(¤t_in); + if (!nal) + return PTL_INV_HANDLE; + + if (nal->validate != NULL) /* nal->validate not a NOOP */ + { + if ((md_in.options & PTL_MD_IOV) == 0) /* contiguous */ + { + rc = nal->validate (nal, md_in.start, md_in.length); + if (rc) + return (PTL_SEGV); + } + else + { + struct iovec *iov = (struct iovec *)md_in.start; + + for (i = 0; i < md_in.niov; i++, iov++) + { + rc = nal->validate (nal, iov->iov_base, iov->iov_len); + if (rc) + return (PTL_SEGV); + } + } + } + + return 0; +} + +static ptl_handle_eq_t md2eq (ptl_md_t *md) +{ + if (PtlHandleEqual (md->eventq, PTL_EQ_NONE)) + return (PTL_EQ_NONE); + + return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); +} + + +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) +{ + PtlMDAttach_in args; + PtlMDAttach_out ret; + int rc; + + rc = validate_md(me_in, md_in); + if (rc == PTL_OK) { + args.eq_in = md2eq(&md_in); + args.me_in = me_in; + args.md_in = md_in; + args.unlink_in = unlink_in; + + rc = do_forward(me_in, PTL_MDATTACH, + &args, sizeof(args), &ret, sizeof(ret)); + } + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = me_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + + + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out) +{ + PtlMDBind_in args; + PtlMDBind_out ret; + int rc; + + rc = validate_md(ni_in, md_in); + if (rc != PTL_OK) + return rc; + + args.eq_in = md2eq(&md_in); + args.ni_in = ni_in; + args.md_in = md_in; + + rc = do_forward(ni_in, PTL_MDBIND, + &args, sizeof(args), &ret, sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (handle_out) { + handle_out->nal_idx = ni_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) +{ + PtlMDUpdate_internal_in args; + PtlMDUpdate_internal_out ret; + int rc; + + args.md_in = md_in; + + if (old_inout) { + args.old_inout = *old_inout; + args.old_inout_valid = 1; + } else + args.old_inout_valid = 0; + + if (new_inout) { + rc = validate_md (md_in, *new_inout); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + args.new_inout = *new_inout; + args.new_inout_valid = 1; + } else + args.new_inout_valid = 0; + + if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) { + args.testq_in = PTL_EQ_NONE; + args.sequence_in = -1; + } else { + ptl_eq_t *eq = ptl_handle2usereq (&testq_in); + + args.testq_in = eq->cb_eq_handle; + args.sequence_in = eq->sequence; + } + + rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + + if (old_inout) + *old_inout = ret.old_inout; + + return ret.rc; +} + +int PtlMDUnlink(ptl_handle_md_t md_in) +{ + PtlMDUnlink_in args; + PtlMDUnlink_out ret; + int rc; + + args.md_in = md_in; + rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_MD : rc; + + return ret.rc; +} + +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out) +{ + ptl_eq_t *eq = NULL; + ptl_event_t *ev = NULL; + PtlEQAlloc_in args; + PtlEQAlloc_out ret; + int rc, i; + nal_t *nal; + + if (!ptl_init) + return PTL_NOINIT; + + nal = ptl_hndl2nal (&interface); + if (nal == NULL) + return PTL_INV_HANDLE; + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... */ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + + PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); + if (!ev) + return PTL_NOSPACE; + + for (i = 0; i < count; i++) + ev[i].sequence = 0; + + if (nal->validate != NULL) { + rc = nal->validate(nal, ev, count * sizeof(ptl_event_t)); + if (rc != PTL_OK) + goto fail; + } + + args.ni_in = interface; + args.count_in = count; + args.base_in = ev; + args.len_in = count * sizeof(*ev); + args.callback_in = callback; + + rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + goto fail; + if (ret.rc) + GOTO(fail, rc = ret.rc); + + PORTAL_ALLOC(eq, sizeof(*eq)); + if (!eq) { + rc = PTL_NOSPACE; + goto fail; + } + + eq->sequence = 1; + eq->size = count; + eq->base = ev; + + /* EQ handles are a little wierd. PtlEQGet() just looks at the + * queued events in shared memory. It doesn't want to do_forward() + * at all, so the cookie in the EQ handle we pass out of here is + * simply a pointer to the event queue we just set up. We stash + * the handle returned by do_forward(), so we can pass it back via + * do_forward() when we need to. 
*/ + + eq->cb_eq_handle.nal_idx = interface.nal_idx; + eq->cb_eq_handle.cookie = ret.handle_out.cookie; + + handle_out->nal_idx = interface.nal_idx; + handle_out->cookie = (__u64)((unsigned long)eq); + return PTL_OK; + +fail: + PORTAL_FREE(ev, count * sizeof(ptl_event_t)); + return rc; +} + +int PtlEQFree(ptl_handle_eq_t eventq) +{ + PtlEQFree_in args; + PtlEQFree_out ret; + ptl_eq_t *eq; + int rc; + + eq = ptl_handle2usereq (&eventq); + args.eventq_in = eq->cb_eq_handle; + + rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, + sizeof(args), &ret, sizeof(ret)); + + /* XXX we're betting rc == PTL_OK here */ + PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); + PORTAL_FREE(eq, sizeof(*eq)); + + return rc; +} + +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) +{ + PtlACEntry_in args; + PtlACEntry_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.ni_in = ni_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.portal_in = portal_in; + + rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, + sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) +{ + PtlPut_in args; + PtlPut_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.ack_req_in = ack_req_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + args.hdr_data_in = hdr_data_in; + + rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? 
rc : ret.rc; +} + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in) +{ + PtlGet_in args; + PtlGet_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + + rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} diff --git a/lnet/lnet/lib-dispatch.c b/lnet/lnet/lib-dispatch.c new file mode 100644 index 0000000..63ed70f --- /dev/null +++ b/lnet/lnet/lib-dispatch.c @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-dispatch.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +typedef struct { + int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); + char *name; +} dispatch_table_t; + +static dispatch_table_t dispatch_table[] = { + [PTL_GETID] {do_PtlGetId, "PtlGetId"}, + [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, + [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, + [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"}, + [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, + [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, + [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, + [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, + [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, + [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, + [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, + [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, + [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, + [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, + [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, + [PTL_ACENTRY] {do_PtlACEntry, "PtlACEntry"}, + [PTL_PUT] {do_PtlPut, "PtlPut"}, + [PTL_GET] {do_PtlGet, "PtlGet"}, + [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, + /* */ {0, ""} +}; + +/* + * This really should be elsewhere, but lib-p30/dispatch.c is + * an automatically generated file. 
+ */ +void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, + void *ret_block) +{ + lib_ni_t *ni = &nal->ni; + + if (index < 0 || index > LIB_MAX_DISPATCH || + !dispatch_table[index].fun) { + CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); + return; + } + + CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, + dispatch_table[index].name, index); + + dispatch_table[index].fun(nal, private, arg_block, ret_block); +} + +char *dispatch_name(int index) +{ + return dispatch_table[index].name; +} diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c new file mode 100644 index 0000000..4c6c292 --- /dev/null +++ b/lnet/lnet/lib-eq.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-eq.c + * Library level Event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_size_t count_in + * void * base_in + * + * Outgoing: + * ptl_handle_eq_t * handle_out + */ + + PtlEQAlloc_in *args = v_args; + PtlEQAlloc_out *ret = v_ret; + + lib_eq_t *eq; + unsigned long flags; + + /* api should have rounded up */ + if (args->count_in != LOWEST_BIT_SET (args->count_in)) + return ret->rc = PTL_VAL_FAILED; + + eq = lib_eq_alloc (nal); + if (eq == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + if (nal->cb_map != NULL) { + struct iovec iov = { + .iov_base = args->base_in, + .iov_len = args->count_in * sizeof (ptl_event_t) }; + + ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); + if (ret->rc != PTL_OK) { + lib_eq_free (nal, eq); + + state_unlock (nal, &flags); + return (ret->rc); + } + } + + eq->sequence = 1; + eq->base = args->base_in; + eq->size = args->count_in; + eq->eq_refcount = 0; + eq->event_callback = args->callback_in; + + lib_initialise_handle (nal, &eq->eq_lh); + list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + + state_unlock(nal, &flags); + + ptl_eq2handle(&ret->handle_out, eq); + return (ret->rc = PTL_OK); +} + +int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_eq_t eventq_in + * + * Outgoing: + */ + + PtlEQFree_in *args = v_args; + PtlEQFree_out *ret = v_ret; + lib_eq_t *eq; + long flags; + + state_lock (nal, &flags); + + eq = ptl_handle2eq(&args->eventq_in, nal); + if (eq == NULL) { + ret->rc = PTL_INV_EQ; + } else if (eq->eq_refcount != 0) { + ret->rc = PTL_EQ_INUSE; + } else { + if (nal->cb_unmap != NULL) { + struct iovec iov = { + .iov_base = eq->base, + .iov_len = eq->size * sizeof (ptl_event_t) }; + + nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + } + + lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); 
+ lib_eq_free (nal, eq); + ret->rc = PTL_OK; + } + + state_unlock (nal, &flags); + + return (ret->rc); +} diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c new file mode 100644 index 0000000..40f3d2c --- /dev/null +++ b/lnet/lnet/lib-init.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-init.c + * Start up the internal library and clear all structures + * Called by the NAL when it initializes. Safe to call multiple times. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include + +#ifdef __KERNEL__ +# include /* for memset() */ +# include +# ifdef KERNEL_ADDR_CACHE +# include +# endif +#else +# include +# include +#endif + +#ifdef PTL_USE_SLAB_CACHE +static int ptl_slab_users; + +kmem_cache_t *ptl_md_slab; +kmem_cache_t *ptl_msg_slab; +kmem_cache_t *ptl_me_slab; +kmem_cache_t *ptl_eq_slab; + +atomic_t md_in_use_count; +atomic_t msg_in_use_count; +atomic_t me_in_use_count; +atomic_t eq_in_use_count; + +/* NB zeroing in ctor and on freeing ensures items that + * kmem_cache_validate() OK, but haven't been initialised + * as an MD/ME/EQ can't have valid handles + */ +static void +ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_md_t)); +} + +static void +ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_me_t)); +} + +static void +ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_eq_t)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + + /* We'll have 1 set of slabs for ALL the nals :) */ + + if (ptl_slab_users++) + return 0; + + ptl_md_slab = kmem_cache_create("portals_MD", + sizeof(lib_md_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_md_slab_ctor, NULL); + if (!ptl_md_slab) { + CERROR("couldn't allocate ptl_md_t slab"); + RETURN (PTL_NOSPACE); + } + + /* NB no ctor for msgs; they don't need handle verification */ + ptl_msg_slab = kmem_cache_create("portals_MSG", + sizeof(lib_msg_t), 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ptl_msg_slab) { + CERROR("couldn't allocate ptl_msg_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_me_slab = kmem_cache_create("portals_ME", + sizeof(lib_me_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_me_slab_ctor, NULL); + if (!ptl_me_slab) { + CERROR("couldn't allocate ptl_me_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_eq_slab = 
kmem_cache_create("portals_EQ", + sizeof(lib_eq_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_eq_slab_ctor, NULL); + if (!ptl_eq_slab) { + CERROR("couldn't allocate ptl_eq_t slab"); + RETURN (PTL_NOSPACE); + } + + RETURN(PTL_OK); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + if (--ptl_slab_users != 0) + return; + + LASSERT (atomic_read (&md_in_use_count) == 0); + LASSERT (atomic_read (&me_in_use_count) == 0); + LASSERT (atomic_read (&eq_in_use_count) == 0); + LASSERT (atomic_read (&msg_in_use_count) == 0); + + if (ptl_md_slab != NULL) + kmem_cache_destroy(ptl_md_slab); + if (ptl_msg_slab != NULL) + kmem_cache_destroy(ptl_msg_slab); + if (ptl_me_slab != NULL) + kmem_cache_destroy(ptl_me_slab); + if (ptl_eq_slab != NULL) + kmem_cache_destroy(ptl_eq_slab); +} +#else + +int +lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lib_freeobj_t, fo_contents); + + space = nal->cb_malloc (nal, n * size); + if (space == NULL) + return (PTL_NOSPACE); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (PTL_OK); +} + +void +lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); + + nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset (fl, 0, sizeof (fl)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + int rc; + + memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); + memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); + memset (&nal->ni.ni_free_mds, 0, sizeof 
(nal->ni.ni_free_mds)); + memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + MAX_MES, sizeof (lib_me_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + MAX_MSGS, sizeof (lib_msg_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + MAX_MDS, sizeof (lib_md_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + MAX_EQS, sizeof (lib_eq_t)); + return (rc); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + lib_freelist_fini (nal, &nal->ni.ni_free_mes); + lib_freelist_fini (nal, &nal->ni.ni_free_msgs); + lib_freelist_fini (nal, &nal->ni.ni_free_mds); + lib_freelist_fini (nal, &nal->ni.ni_free_eqs); +} + +#endif + +__u64 +lib_create_interface_cookie (nal_cb_t *nal) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid in a new instance of the same + * interface. Initialisation time, even if it's only implemented + * to millisecond resolution is probably easily good enough. 
*/ + struct timeval tv; + __u64 cookie; +#ifndef __KERNEL__ + int rc = gettimeofday (&tv, NULL); + LASSERT (rc == 0); +#else + do_gettimeofday(&tv); +#endif + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return (cookie); +} + +int +lib_setup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + int i; + + /* Arbitrary choice of hash table size */ +#ifdef __KERNEL__ + ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); +#else + ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; +#endif + ni->ni_lh_hash_table = + (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size + * sizeof (struct list_head)); + if (ni->ni_lh_hash_table == NULL) + return (PTL_NOSPACE); + + for (i = 0; i < ni->ni_lh_hash_size; i++) + INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); + + ni->ni_next_object_cookie = 0; + + return (PTL_OK); +} + +void +lib_cleanup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->ni_lh_hash_table == NULL) + return; + + nal->cb_free (nal, ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); +} + +lib_handle_t * +lib_lookup_cookie (nal_cb_t *nal, __u64 cookie) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + struct list_head *list; + struct list_head *el; + unsigned int hash; + + hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; + list = &ni->ni_lh_hash_table[hash]; + + list_for_each (el, list) { + lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); + + if (lh->lh_cookie == cookie) + return (lh); + } + + return (NULL); +} + +void +lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + unsigned int hash; + + lh->lh_cookie = ni->ni_next_object_cookie++; + hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size; + list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); +} + +void +lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + list_del (&lh->lh_hash_chain); 
+} + +int +lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size) +{ + int rc = PTL_OK; + lib_ni_t *ni = &nal->ni; + int i; + ENTRY; + + /* NB serialised in PtlNIInit() */ + + if (ni->refcnt != 0) { /* already initialised */ + ni->refcnt++; + goto out; + } + + /* + * Allocate the portal table for this interface + * and all per-interface objects. + */ + memset(&ni->counters, 0, sizeof(lib_counters_t)); + + rc = kportal_descriptor_setup (nal); + if (rc != PTL_OK) + goto out; + + INIT_LIST_HEAD (&ni->ni_active_msgs); + INIT_LIST_HEAD (&ni->ni_active_mds); + INIT_LIST_HEAD (&ni->ni_active_eqs); + + INIT_LIST_HEAD (&ni->ni_test_peers); + + ni->ni_interface_cookie = lib_create_interface_cookie (nal); + ni->ni_next_object_cookie = 0; + rc = lib_setup_handle_hash (nal); + if (rc != PTL_OK) + goto out; + + ni->nid = nid; + ni->pid = pid; + + ni->num_nodes = gsize; + ni->tbl.size = ptl_size; + + ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); + if (ni->tbl.tbl == NULL) { + rc = PTL_NOSPACE; + goto out; + } + + for (i = 0; i < ptl_size; i++) + INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + + ni->debug = PTL_DEBUG_NONE; + ni->up = 1; + ni->refcnt++; + + out: + if (rc != PTL_OK) { + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + } + + RETURN (rc); +} + +int +lib_fini(nal_cb_t * nal) +{ + lib_ni_t *ni = &nal->ni; + int idx; + + ni->refcnt--; + + if (ni->refcnt != 0) + goto out; + + /* NB no stat_lock() since this is the last reference. 
The NAL + * should have shut down already, so it should be safe to unlink + * and free all descriptors, even those that appear committed to a + * network op (eg MD with non-zero pending count) + */ + + for (idx = 0; idx < ni->tbl.size; idx++) + while (!list_empty (&ni->tbl.tbl[idx])) { + lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + lib_me_t, me_list); + + CERROR ("Active me %p on exit\n", me); + list_del (&me->me_list); + lib_me_free (nal, me); + } + + while (!list_empty (&ni->ni_active_mds)) { + lib_md_t *md = list_entry (ni->ni_active_mds.next, + lib_md_t, md_list); + + CERROR ("Active md %p on exit\n", md); + list_del (&md->md_list); + lib_md_free (nal, md); + } + + while (!list_empty (&ni->ni_active_eqs)) { + lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, + lib_eq_t, eq_list); + + CERROR ("Active eq %p on exit\n", eq); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + } + + while (!list_empty (&ni->ni_active_msgs)) { + lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, + lib_msg_t, msg_list); + + CERROR ("Active msg %p on exit\n", msg); + list_del (&msg->msg_list); + lib_msg_free (nal, msg); + } + + nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + ni->up = 0; + + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + + out: + return (PTL_OK); +} diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c new file mode 100644 index 0000000..d171050 --- /dev/null +++ b/lnet/lnet/lib-md.c @@ -0,0 +1,412 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-md.c + * Memory Descriptor management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +/* + * must be called with state lock held + */ +void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +{ + lib_me_t *me = md->me; + + if (md->pending != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + md->md_flags |= PTL_MD_FLAG_UNLINK; + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if ((md->options & PTL_MD_KIOV) != 0) { + if (nal->cb_unmap_pages != NULL) + nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->cb_unmap != NULL) + nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, + &md->md_addrkey); + + if (me) { + me->md = NULL; + if (me->unlink == PTL_UNLINK) + lib_me_unlink(nal, me); + } + + if (md->eq != NULL) + { + md->eq->eq_refcount--; + LASSERT (md->eq->eq_refcount >= 0); + } + + lib_invalidate_handle (nal, &md->md_lh); + list_del (&md->md_list); + lib_md_free(nal, md); +} + +/* must be called with state lock held */ +static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, + ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +{ + const int max_size_opts = PTL_MD_AUTO_UNLINK | + 
PTL_MD_MAX_SIZE; + lib_eq_t *eq = NULL; + int rc; + int i; + + /* NB we are passes an allocated, but uninitialised/active md. + * if we return success, caller may lib_md_unlink() it. + * otherwise caller may only lib_md_free() it. + */ + + if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) { + eq = ptl_handle2eq(eqh, nal); + if (eq == NULL) + return PTL_INV_EQ; + } + + if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ + md->niov > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_TOO_MANY; + + if ((md->options & max_size_opts) != 0 && /* max size used */ + (md->max_size < 0 || md->max_size > md->length)) // illegal max_size + return PTL_INV_MD; + + new->me = NULL; + new->start = md->start; + new->length = md->length; + new->offset = 0; + new->max_size = md->max_size; + new->unlink = unlink; + new->options = md->options; + new->user_ptr = md->user_ptr; + new->eq = eq; + new->threshold = md->threshold; + new->pending = 0; + new->md_flags = 0; + + if ((md->options & PTL_MD_IOV) != 0) { + int total_length = 0; + + if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.iov, md->start, + md->niov * sizeof (new->md_iov.iov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the base address on trust */ + if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_VAL_FAILED; + + total_length += new->md_iov.iov[i].iov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } else if ((md->options & PTL_MD_KIOV) != 0) { +#ifndef __KERNEL__ + return PTL_INV_MD; +#else + int total_length = 0; + + /* Trap attempt to use paged I/O if unsupported early. 
*/ + if (nal->cb_send_pages == NULL || + nal->cb_recv_pages == NULL) + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, + md->niov * sizeof (new->md_iov.kiov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the page pointer on trust */ + if (new->md_iov.kiov[i].kiov_offset + + new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + return PTL_VAL_FAILED; /* invalid length */ + + total_length += new->md_iov.kiov[i].kiov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map_pages != NULL) { + rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } +#endif + } else { /* contiguous */ + new->md_niov = 1; + new->md_iov.iov[0].iov_base = md->start; + new->md_iov.iov[0].iov_len = md->length; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } + + if (eq != NULL) + eq->eq_refcount++; + + /* It's good; let handle2md succeed and add to active mds */ + lib_initialise_handle (nal, &new->md_lh); + list_add (&new->md_list, &nal->ni.ni_active_mds); + + return PTL_OK; +} + +/* must be called with state lock held */ +void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + new->start = md->start; + new->length = md->length; + new->threshold = md->threshold; + new->max_size = md->max_size; + new->options = md->options; + new->user_ptr = md->user_ptr; + ptl_eq2handle(&new->eventq, md->eq); + new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 
0 : md->md_niov; +} + +int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_me_t current_in + * ptl_md_t md_in + * ptl_unlink_t unlink_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDAttach_in *args = v_args; + PtlMDAttach_out *ret = v_ret; + lib_me_t *me; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->me_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else if (me->md != NULL) { + ret->rc = PTL_INUSE; + } else { + ret->rc = lib_md_build(nal, md, private, &args->md_in, + &args->eq_in, args->unlink_in); + + if (ret->rc == PTL_OK) { + me->md = md; + md->me = me; + + ptl_md2handle(&ret->handle_out, md); + + state_unlock (nal, &flags); + return (PTL_OK); + } + } + + lib_md_free (nal, md); + + state_unlock (nal, &flags); + return (ret->rc); +} + +int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_md_t md_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDBind_in *args = v_args; + PtlMDBind_out *ret = v_ret; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + ret->rc = lib_md_build(nal, md, private, + &args->md_in, &args->eq_in, PTL_UNLINK); + + if (ret->rc == PTL_OK) { + ptl_md2handle(&ret->handle_out, md); + + state_unlock(nal, &flags); + return (PTL_OK); + } + + lib_md_free (nal, md); + + state_unlock(nal, &flags); + return (ret->rc); +} + +int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMDUnlink_in *args = v_args; + PtlMDUnlink_out *ret = v_ret; + + lib_md_t *md; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + } else if 
(md->pending != 0) { /* being filled/spilled */ + ret->rc = PTL_MD_INUSE; + } else { + /* Callers attempting to unlink a busy MD which will get + * unlinked once the net op completes should see INUSE, + * before completion and INV_MD thereafter. LASSERT we've + * got that right... */ + LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); + + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_md_t * old_inout + * ptl_md_t * new_inout + * ptl_handle_eq_t testq_in + * ptl_seq_t sequence_in + * + * Outgoing: + * ptl_md_t * old_inout + * ptl_md_t * new_inout + */ + PtlMDUpdate_internal_in *args = v_args; + PtlMDUpdate_internal_out *ret = v_ret; + lib_md_t *md; + lib_eq_t *test_eq = NULL; + ptl_md_t *new = &args->new_inout; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (args->old_inout_valid) + lib_md_deconstruct(nal, md, &ret->old_inout); + + if (!args->new_inout_valid) { + ret->rc = PTL_OK; + goto out; + } + + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(&args->testq_in, nal); + if (test_eq == NULL) { + ret->rc = PTL_INV_EQ; + goto out; + } + } + + if (md->pending != 0) { + ret->rc = PTL_NOUPDATE; + goto out; + } + + if (test_eq == NULL || + test_eq->sequence == args->sequence_in) { + lib_me_t *me = md->me; + +#warning this does not track eq refcounts properly + + ret->rc = lib_md_build(nal, md, private, + new, &new->eventq, md->unlink); + + md->me = me; + } else { + ret->rc = PTL_NOUPDATE; + } + + out: + state_unlock(nal, &flags); + return (ret->rc); +} diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c new file mode 100644 index 0000000..34fb606 --- /dev/null +++ 
b/lnet/lnet/lib-me.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-me.c + * Match Entry management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); + +int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEAttach_in *args = v_args; + PtlMEAttach_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_ptl_t *tbl = &ni->tbl; + unsigned long flags; + lib_me_t *me; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + /* Should check for valid matchid, but not yet */ + if (0) + return ret->rc = PTL_INV_PROC; + + me = lib_me_alloc (nal); + if (me == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me->match_id = args->match_id_in; + me->match_bits = args->match_bits_in; + me->ignore_bits = args->ignore_bits_in; + me->unlink = args->unlink_in; + me->md = NULL; + + lib_initialise_handle (nal, &me->me_lh); + + if (args->position_in == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + else + list_add(&me->me_list, &(tbl->tbl[args->index_in])); + + ptl_me2handle(&ret->handle_out, me); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEInsert_in *args = v_args; + PtlMEInsert_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + lib_me_t *new; + + new = lib_me_alloc (nal); + if (new == NULL) + return (ret->rc = PTL_NOSPACE); + + /* Should check for valid matchid, but not yet */ + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + lib_me_free (nal, new); + + state_unlock (nal, &flags); + return (ret->rc = PTL_INV_ME); + } + + new->match_id = args->match_id_in; + new->match_bits = args->match_bits_in; + new->ignore_bits = args->ignore_bits_in; + new->unlink = args->unlink_in; + new->md = NULL; + + lib_initialise_handle (nal, &new->me_lh); + + if (args->position_in == 
PTL_INS_AFTER) + list_add_tail(&new->me_list, &me->me_list); + else + list_add(&new->me_list, &me->me_list); + + ptl_me2handle(&ret->handle_out, new); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEUnlink_in *args = v_args; + PtlMEUnlink_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else { + lib_me_unlink(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +/* call with state_lock please */ +void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->debug & PTL_DEBUG_UNLINK) { + ptl_handle_any_t handle; + ptl_me2handle(&handle, me); + } + + list_del (&me->me_list); + + if (me->md) { + me->md->me = NULL; + lib_md_unlink(nal, me->md); + } + + lib_invalidate_handle (nal, &me->me_lh); + lib_me_free(nal, me); +} + +int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlTblDump_in *args = v_args; + PtlTblDump_out *ret = v_ret; + lib_ptl_t *tbl = &nal->ni.tbl; + ptl_handle_any_t handle; + struct list_head *tmp; + unsigned long flags; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + nal->cb_printf(nal, "Portal table index %d\n", args->index_in); + + state_lock(nal, &flags); + list_for_each(tmp, &(tbl->tbl[args->index_in])) { + lib_me_t *me = list_entry(tmp, lib_me_t, me_list); + ptl_me2handle(&handle, me); + lib_me_dump(nal, me); + } + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEDump_in *args = v_args; + PtlMEDump_out *ret = v_ret; + lib_me_t *me; + unsigned long flags; + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = 
PTL_INV_ME; + } else { + lib_me_dump(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return ret->rc; +} + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) +{ + nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); + + nal->cb_printf(nal, "\tMD\t= %p\n", me->md); + nal->cb_printf(nal, "\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + nal->cb_printf(nal, "\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); +} diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c new file mode 100644 index 0000000..7ba1664 --- /dev/null +++ b/lnet/lnet/lib-move.c @@ -0,0 +1,1287 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif +#include +#include +#include + +/* + * Right now it does not check access control lists. 
+ * + * We only support one MD per ME, which is how the Portals 3.1 spec is written. + * All previous complication is removed. + */ + +static lib_me_t * +lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, + ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, + ptl_match_bits_t match_bits, ptl_size_t *mlength_out, + ptl_size_t *offset_out, int *unlink_out) +{ + lib_ni_t *ni = &nal->ni; + struct list_head *match_list = &ni->tbl.tbl[index]; + struct list_head *tmp; + lib_me_t *me; + lib_md_t *md; + ptl_size_t mlength; + ptl_size_t offset; + + ENTRY; + + CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " + "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); + + if (index < 0 || index >= ni->tbl.size) { + CERROR("Invalid portal %d not in [0-%d]\n", + index, ni->tbl.size); + goto failed; + } + + list_for_each (tmp, match_list) { + me = list_entry(tmp, lib_me_t, me_list); + md = me->md; + + /* ME attached but MD not attached yet */ + if (md == NULL) + continue; + + LASSERT (me == md->me); + + /* MD deactivated */ + if (md->threshold == 0) + continue; + + /* mismatched MD op */ + if ((md->options & op_mask) == 0) + continue; + + /* mismatched ME nid/pid? */ + if (me->match_id.nid != PTL_NID_ANY && + me->match_id.nid != src_nid) + continue; + + if (me->match_id.pid != PTL_PID_ANY && + me->match_id.pid != src_pid) + continue; + + /* mismatched ME matchbits? */ + if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) + continue; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) + offset = md->offset; + else + offset = roffset; + + mlength = md->length - offset; + if ((md->options & PTL_MD_MAX_SIZE) != 0 && + mlength > md->max_size) + mlength = md->max_size; + + if (rlength <= mlength) { /* fits in allowed space */ + mlength = rlength; + } else if ((md->options & PTL_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet %d too big: %d left, " + "%d allowed\n", rlength, md->length - offset, + mlength); + goto failed; + } + + md->offset = offset + mlength; + + *offset_out = offset; + *mlength_out = mlength; + *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 && + md->offset >= (md->length - md->max_size)); + RETURN (me); + } + + failed: + CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 + " offset %d length %d: no match\n", + ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + src_nid, src_pid, index, match_bits, roffset, rlength); + RETURN(NULL); +} + +int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +{ + PtlFailNid_in *args = v_args; + PtlFailNid_out *ret = v_ret; + lib_test_peer_t *tp; + unsigned long flags; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + if (args->threshold != 0) { + /* Adding a new entry */ + tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + if (tp == NULL) + return (ret->rc = PTL_FAIL); + + tp->tp_nid = args->nid; + tp->tp_threshold = args->threshold; + + state_lock (nal, &flags); + list_add (&tp->tp_list, &nal->ni.ni_test_peers); + state_unlock (nal, &flags); + return (ret->rc = PTL_OK); + } + + /* removing entries */ + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + args->nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == args->nid) /* matched this 
one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + + list_del (&tp->tp_list); + nal->cb_free (nal, tp, sizeof (*tp)); + } + return (ret->rc = PTL_OK); +} + +static int +fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) +{ + lib_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + unsigned long flags; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != PTL_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + list_del (&tp->tp_list); + + nal->cb_free (nal, tp, sizeof (*tp)); + } + + return (fail); +} + +ptl_size_t +lib_iov_nob (int niov, struct iovec *iov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} + +void +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (dest, iov->iov_base, nob); + + len -= nob; + dest += nob; + niov--; + iov++; + } +} + +void +lib_copy_buf2iov (int niov, struct iovec *iov, char 
*src, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (iov->iov_base, src, nob); + + len -= nob; + src += nob; + niov--; + iov++; + } +} + +static int +lib_extract_iov (struct iovec *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + struct iovec *src = md->md_iov.iov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (dst_niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} + +#ifndef __KERNEL__ +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + LASSERT (0); + return (0); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + LASSERT (0); +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +{ + LASSERT (0); +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + LASSERT (0); +} + +#else + +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + ptl_size_t 
nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (dest, addr, nob); + kunmap (kiov->kiov_page); + + len -= nob; + dest += nob; + niov--; + kiov++; + } +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +{ + ptl_size_t nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (addr, src, nob); + kunmap (kiov->kiov_page); + + len -= nob; + src += nob; + niov--; + kiov++; + } +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + ptl_kiov_t *src = md->md_iov.kiov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return (dst_niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} +#endif + +void +lib_recv (nal_cb_t *nal, void *private, 
lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +{ + int niov; + + if (mlen == 0) + nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); + else if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); + nal->cb_recv (nal, private, msg, + niov, msg->msg_iov.iov, mlen, rlen); + } else { + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); + nal->cb_recv_pages (nal, private, msg, + niov, msg->msg_iov.kiov, mlen, rlen); + } +} + +int +lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len) +{ + int niov; + + if (len == 0) + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + 0, NULL, 0)); + + if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.iov, len)); + } + + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); + return (nal->cb_send_pages (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.kiov, len)); +} + +static lib_msg_t * +get_new_msg (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called holding the state_lock */ + lib_counters_t *counters = &nal->ni.counters; + lib_msg_t *msg = lib_msg_alloc (nal); + + if (msg == NULL) + return (NULL); + + memset (msg, 0, sizeof (*msg)); + + msg->send_ack = 0; + + msg->md = md; + msg->ev.arrival_time = get_cycles(); + md->pending++; + if (md->threshold != PTL_MD_THRESH_INF) { + LASSERT (md->threshold > 0); + md->threshold--; + } + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; + + list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + + return (msg); +} + + +/* + * Incoming messages have a ptl_msg_t object associated with them + * by the library. 
This object encapsulates the state of the + * message and allows the NAL to do non-blocking receives or sends + * of long messages. + * + */ +static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + unsigned long flags; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); + hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, + hdr->src_nid, hdr->src_pid, + PTL_HDR_LENGTH (hdr), hdr->msg.put.offset, + hdr->msg.put.match_bits, + &mlength, &offset, &unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " + "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + !(md->options & PTL_MD_ACK_DISABLE)) { + msg->send_ack = 1; + msg->ack_wmd = hdr->msg.put.ack_wmd; + msg->nid = hdr->src_nid; + msg->pid = hdr->src_pid; + msg->ev.match_bits = hdr->msg.put.match_bits; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_PUT; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.put.ptl_index; + msg->ev.match_bits = hdr->msg.put.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr->msg.put.hdr_data; + + /* NB if this match has exhausted the MD, we can't be sure + * that this 
event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. + */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += mlength; + + /* only unlink after MD's pending count has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + ptl_hdr_t reply; + unsigned long flags; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); + hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); + + /* compatibility check until field is deleted */ + if (hdr->msg.get.return_offset != 0) + CERROR("Unexpected non-zero get.return_offset %x from " + LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, + hdr->src_nid, hdr->src_pid, + hdr->msg.get.sink_length, hdr->msg.get.src_offset, + hdr->msg.get.match_bits, + &mlength, &offset, 
&unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " + "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + /* NB if this match has exhausted the MD, we can't be sure + * that this event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. 
+ */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.send_count++; + ni->counters.send_length += mlength; + + /* only unlink after MD's refcount has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + memset (&reply, 0, sizeof (reply)); + reply.type = HTON__u32 (PTL_MSG_REPLY); + reply.dest_nid = HTON__u64 (hdr->src_nid); + reply.src_nid = HTON__u64 (ni->nid); + reply.dest_pid = HTON__u32 (hdr->src_pid); + reply.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength); + + reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + + rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, + hdr->src_nid, hdr->src_pid, md, offset, mlength); + if (rc != 0) { + CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", + ni->nid, hdr->src_nid); + state_lock (nal, &flags); + goto drop; + } + + /* Complete the incoming message */ + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (rc); + drop: + ni->counters.drop_count++; + ni->counters.drop_length += hdr->msg.get.sink_length; + state_unlock(nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + int rlength; + int length; + lib_msg_t *msg; + unsigned long flags; + + /* compatibility check until field is deleted */ + if (hdr->msg.reply.dst_offset != 0) + CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n", + hdr->msg.reply.dst_offset, hdr->src_nid); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD 
"LPX64"."LPX64"\n", + ni->nid, hdr->src_nid, + md == NULL ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + goto drop; + } + + LASSERT (md->offset == 0); + + length = rlength = PTL_HDR_LENGTH(hdr); + + if (length > md->length) { + if ((md->options & PTL_MD_TRUNCATE) == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64 + " length %d for MD "LPX64" would overflow (%d)\n", + ni->nid, hdr->src_nid, length, + hdr->msg.reply.dst_wmd.wh_object_cookie, + md->length); + goto drop; + } + length = md->length; + } + + CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", + hdr->src_nid, length, rlength, + hdr->msg.reply.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping REPLY from "LPU64": can't " + "allocate msg\n", ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += length; + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, 0, length, rlength); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + unsigned long flags; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = 
ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR(LPU64": Dropping ACK from "LPU64" to %s MD " + LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + goto drop; + } + + CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", + ni->nid, hdr->src_nid, + hdr->msg.ack.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + state_unlock(nal, &flags); + lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + case PTL_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} + +void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); + nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, + hdr->src_pid); + nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, + hdr->dest_pid); + + switch (hdr->type) { + default: + break; + + case PTL_MSG_PUT: + nal->cb_printf(nal, + " Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + 
hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + nal->cb_printf(nal, + " Length %d, offset %d, hdr data "LPX64"\n", + PTL_HDR_LENGTH(hdr), hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + nal->cb_printf(nal, + " Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + nal->cb_printf(nal, + " Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case PTL_MSG_ACK: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case PTL_MSG_REPLY: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + PTL_HDR_LENGTH(hdr)); + } + +} /* end of print_hdr() */ + + +int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + unsigned long flags; + + /* NB static check; optimizer will elide this if it's right */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.put.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.get.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.reply.length)); + + /* convert common fields to host byte order */ + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->src_nid = NTOH__u64 (hdr->src_nid); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); + hdr->src_pid = NTOH__u32 (hdr->src_pid); + hdr->type = NTOH__u32 (hdr->type); + PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr)); +#if 0 + nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", + nal->ni.nid, nal, hdr, hdr->type); + 
print_hdr(nal, hdr); +#endif + if (hdr->type == PTL_MSG_HELLO) { + /* dest_nid is really ptl_magicversion_t */ + ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; + + CERROR (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->ni.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (hdr->dest_nid != nal->ni.nid) { + CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 + " (not me)\n", nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + + state_lock (nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ + { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": simulated failure\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + return (-1); + } + + switch (hdr->type) { + case PTL_MSG_ACK: + return (parse_ack(nal, hdr, private)); + case PTL_MSG_PUT: + return (parse_put(nal, hdr, private)); + break; + case PTL_MSG_GET: + return (parse_get(nal, hdr, private)); + break; + case PTL_MSG_REPLY: + return (parse_reply(nal, hdr, private)); + break; + default: + CERROR(LPU64": Dropping message from "LPU64 + ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, + hdr->type); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } +} + + +int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_ack_req_t ack_req_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + 
PtlPut_in *args = v_args; + PtlPut_out *ret = v_ret; + ptl_hdr_t hdr; + + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + ptl_process_id_t *id = &args->target_in; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? */ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + ret->rc = PTL_OK; + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_PUT); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length); + + /* NB handles only looked up by creator (no flips) */ + if (args->ack_req_in == PTL_ACK_REQ) { + hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; + } else { + hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; + } + + hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.put.offset = HTON__u32 (args->offset_in); + hdr.msg.put.hdr_data = args->hdr_data_in; + + ni->counters.send_count++; + ni->counters.send_length += md->length; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("BAD: could not allocate msg!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we need to allocate a message state object and record the + * information about this operation that will be recorded into + * event queue once the message 
has been completed. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. + */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + + return ret->rc = PTL_OK; +} + + +int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + PtlGet_in *args = v_args; + PtlGet_out *ret = v_ret; + ptl_hdr_t hdr; + lib_msg_t *msg = NULL; + lib_ni_t *ni = &nal->ni; + ptl_process_id_t *id = &args->target_in; + lib_md_t *md; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + LASSERT (md->offset == 0); + + CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_GET); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = 0; + + /* NB handles only looked up by creator (no flips) */ + hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; + + hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.sink_length = HTON__u32 (md->length); + + ni->counters.send_count++; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we must allocate a message state object that will record + * the information to be filled in once the message has been + * completed. More information is in the do_PtlPut() comments. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. 
+ */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + + return ret->rc = PTL_OK; +} diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c new file mode 100644 index 0000000..20a6c66 --- /dev/null +++ b/lnet/lnet/lib-msg.c @@ -0,0 +1,163 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-msg.c + * Message decoding, parsing and finalizing routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include + +int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +{ + lib_md_t *md; + lib_eq_t *eq; + int rc; + unsigned long flags; + + /* ni went down while processing this message */ + if (nal->ni.up == 0) { + return -1; + } + + if (msg == NULL) + return 0; + + rc = 0; + if (msg->send_ack) { + ptl_hdr_t ack; + + LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + + memset (&ack, 0, sizeof (ack)); + ack.type = HTON__u32 (PTL_MSG_ACK); + ack.dest_nid = HTON__u64 (msg->nid); + ack.src_nid = HTON__u64 (nal->ni.nid); + ack.dest_pid = HTON__u32 (msg->pid); + ack.src_pid = HTON__u32 (nal->ni.pid); + PTL_HDR_LENGTH(&ack) = 0; + + ack.msg.ack.dst_wmd = msg->ack_wmd; + ack.msg.ack.match_bits = msg->ev.match_bits; + ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); + + rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, + msg->nid, msg->pid, NULL, 0, 0); + } + + md = msg->md; + LASSERT (md->pending > 0); /* I've not dropped my ref yet */ + eq = md->eq; + + state_lock(nal, &flags); + + if (eq != NULL) { + ptl_event_t *ev = &msg->ev; + ptl_event_t *eq_slot; + + /* I have to hold the lock while I bump the sequence number + * and copy the event into the queue. If not, and I was + * interrupted after bumping the sequence number, other + * events could fill the queue, including the slot I just + * allocated to this event. On resuming, I would overwrite + * a more 'recent' event with old event state, and + * processes taking events off the queue would not detect + * overflow correctly. + */ + + ev->sequence = eq->sequence++;/* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Invalidate unlinked_me unless this is the last + * event for an auto-unlinked MD. 
Note that if md was + * auto-unlinked, md->pending can only decrease + */ + if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ + md->pending != 1) /* not last ref */ + ev->unlinked_me = PTL_HANDLE_NONE; + + /* Copy the event into the allocated slot, ensuring all the + * rest of the event's contents have been copied _before_ + * the sequence number gets updated. A processes 'getting' + * an event waits on the next queue slot's sequence to be + * 'new'. When it is, _all_ other event fields had better + * be consistent. I assert 'sequence' is the last member, + * so I only need a 2 stage copy. + */ + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' */ + + /* cb_write is not necessarily atomic, so this could + cause a race with PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + + /* I must also ensure that (a) callbacks are made in the + * same order as the events land in the queue, and (b) the + * callback occurs before the event can be removed from the + * queue, so I can't drop the lock during the callback. 
*/ + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + (void)((eq->event_callback) (ev)); + } + + LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + + md->pending--; + if (md->pending == 0 && /* no more outstanding operations on this md */ + (md->threshold == 0 || /* done its business */ + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + lib_md_unlink(nal, md); + + list_del (&msg->msg_list); + nal->ni.counters.msgs_alloc--; + lib_msg_free(nal, msg); + + state_unlock(nal, &flags); + + return rc; +} diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c new file mode 100644 index 0000000..37dcb91 --- /dev/null +++ b/lnet/lnet/lib-ni.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-ni.c + * Network status registers and distance functions. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +#define MAX_DIST 18446744073709551615UL + +int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlNIDebug_in *args = v_args; + PtlNIDebug_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->rc = ni->debug; + ni->debug = args->mask_in; + + return 0; +} + +int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_sr_index_t register_in + * + * Outgoing: + * ptl_sr_value_t * status_out + */ + + PtlNIStatus_in *args = v_args; + PtlNIStatus_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_counters_t *count = &ni->counters; + + if (!args) + return ret->rc = PTL_SEGV; + + ret->rc = PTL_OK; + ret->status_out = 0; + + /* + * I hate this sort of code.... Hash tables, offset lists? + * Treat the counters as an array of ints? + */ + if (args->register_in == PTL_SR_DROP_COUNT) + ret->status_out = count->drop_count; + + else if (args->register_in == PTL_SR_DROP_LENGTH) + ret->status_out = count->drop_length; + + else if (args->register_in == PTL_SR_RECV_COUNT) + ret->status_out = count->recv_count; + + else if (args->register_in == PTL_SR_RECV_LENGTH) + ret->status_out = count->recv_length; + + else if (args->register_in == PTL_SR_SEND_COUNT) + ret->status_out = count->send_count; + + else if (args->register_in == PTL_SR_SEND_LENGTH) + ret->status_out = count->send_length; + + else if (args->register_in == PTL_SR_MSGS_MAX) + ret->status_out = count->msgs_max; + else + ret->rc = PTL_INV_SR_INDX; + + return ret->rc; +} + + +int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_process_id_t process_in + + * + * Outgoing: + * unsigned long * distance_out + + */ + + PtlNIDist_in *args = v_args; + PtlNIDist_out *ret = v_ret; + + unsigned long dist; + ptl_process_id_t id_in = args->process_in; + ptl_nid_t nid; + int rc; + + 
nid = id_in.nid; + + if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { + ret->distance_out = (unsigned long) MAX_DIST; + return PTL_INV_PROC; + } + + ret->distance_out = dist; + + return ret->rc = PTL_OK; +} diff --git a/lnet/lnet/lib-not-impl.c b/lnet/lnet/lib-not-impl.c new file mode 100644 index 0000000..78959b2 --- /dev/null +++ b/lnet/lnet/lib-not-impl.c @@ -0,0 +1,37 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-not-impl.c + * + * boiler plate functions that can be used to write the + * library side routines + */ + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + + +int do_PtlACEntry(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_ac_index_t index_in + * ptl_process_id_t match_id_in + * ptl_pt_index_t portal_in + + * + * Outgoing: + + */ + + PtlACEntry_in *args = v_args; + PtlACEntry_out *ret = v_ret; + + if (!args) + return ret->rc = PTL_SEGV; + + return ret->rc = PTL_NOT_IMPLEMENTED; +} diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c new file mode 100644 index 0000000..e00e9f0 --- /dev/null +++ b/lnet/lnet/lib-pid.c @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-pid.c + * Process identification routines + */ + +/* This should be removed. 
The NAL should have the PID information */ +#define DEBUG_SUBSYSTEM S_PORTALS + +#if defined (__KERNEL__) +# include +extern int getpid(void); +#else +# include +# include +#endif +#include +#include + +int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t handle_in + * + * Outgoing: + * ptl_process_id_t * id_out + * ptl_id_t * gsize_out + */ + + PtlGetId_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->id_out.nid = ni->nid; + ret->id_out.pid = ni->pid; + + return ret->rc = PTL_OK; +} diff --git a/lnet/packaging/.cvsignore b/lnet/packaging/.cvsignore new file mode 100644 index 0000000..fd1d56a --- /dev/null +++ b/lnet/packaging/.cvsignore @@ -0,0 +1,8 @@ +Makefile +Makefile.in +aclocal.m4 +config.log +config.status +config.cache +configure +portals.spec diff --git a/lnet/packaging/Makefile.am b/lnet/packaging/Makefile.am new file mode 100644 index 0000000..126bc69 --- /dev/null +++ b/lnet/packaging/Makefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +EXTRA_DIST = portals.spec \ No newline at end of file diff --git a/lnet/packaging/portals.spec.in b/lnet/packaging/portals.spec.in new file mode 100644 index 0000000..e196b3f --- /dev/null +++ b/lnet/packaging/portals.spec.in @@ -0,0 +1,116 @@ +%define kversion @RELEASE@ +%define linuxdir @LINUX@ +%define version HEAD + +Summary: Sandia Portals Message Passing - utilities +Name: portals +Version: %{version} +Release: 0210101748uml +Copyright: LGPL +Group: Utilities/System +BuildRoot: /var/tmp/portals-%{version}-root +Source: http://sandiaportals.org/portals-%{version}.tar.gz + +%description +Sandia Portals message passing package. Contains kernel modules, libraries and utilities. 
+ +%package -n portals-modules +Summary: Kernel modules and NAL's for portals +Group: Development/Kernel + +%description -n portals-modules +Object-Based Disk storage drivers for Linux %{kversion}. + +%package -n portals-source +Summary: Portals kernel source for rebuilding with other kernels +Group: Development/Kernel + +%description -n portals-source +Portals kernel source for rebuilding with other kernels + +%prep +%setup -n portals-%{version} + +%build +rm -rf $RPM_BUILD_ROOT + +# Create the pristine source directory. +srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version} +mkdir -p $srcdir +find . -name CVS -prune -o -print | cpio -ap $srcdir + +# Set an explicit path to our Linux tree, if we can. +conf_flag= +linuxdir=%{linuxdir} +test -d $linuxdir && conf_flag=--with-linux=$linuxdir +./configure $conf_flag +make + +%install +make install prefix=$RPM_BUILD_ROOT + +%ifarch alpha +# this hurts me + conf_flag= + linuxdir=%{linuxdir} + test -d $linuxdir && conf_flag=--with-linux=$linuxdir + make clean + ./configure --enable-rtscts-myrinet $conf_flag + make + cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o + cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload +%endif + + +%files +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /usr/sbin/acceptor +%attr(-, root, root) /usr/sbin/ptlctl +%attr(-, root, root) /usr/sbin/debugctl +%ifarch alpha +%attr(-, root, root) /usr/sbin/mcpload +%endif +%attr(-, root, root) /lib/libmyrnal.a +%attr(-, root, root) /lib/libptlapi.a +%attr(-, root, root) /lib/libptlctl.a +%attr(-, root, root) /lib/libprocbridge.a +%attr(-, root, root) /lib/libptllib.a +%attr(-, root, root) /lib/libtcpnal.a +%attr(-, root, root) /lib/libtcpnalutil.a +%attr(-, root, root) /usr/include/portals/*.h +%attr(-, root, root) /usr/include/portals/base/*.h +%attr(-, root, root) /usr/include/linux/*.h + +%files -n portals-modules +%attr(-, root, root) %doc COPYING +%attr(-, root, root) 
/lib/modules/%{kversion}/kernel/net/portals/portals.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o +%ifarch alpha +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o +%endif +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o + +%files -n portals-source +%attr(-, root, root) /usr/src/portals-%{version} + +%post +if [ ! -e /dev/portals ]; then + mknod /dev/portals c 10 240 +fi +depmod -ae || exit 0 + +grep -q portals /etc/modules.conf || \ + echo 'alias char-major-10-240 portals' >> /etc/modules.conf + +grep -q '/dev/portals' /etc/modules.conf || \ + echo 'alias /dev/portals portals' >> /etc/modules.conf + +%postun +depmod -ae || exit 0 + +%clean +#rm -rf $RPM_BUILD_ROOT + +# end of file diff --git a/lnet/router/Makefile.am b/lnet/router/Makefile.am new file mode 100644 index 0000000..1c8087b --- /dev/null +++ b/lnet/router/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +MODULE = kptlrouter +modulenet_DATA = kptlrouter.o +EXTRA_PROGRAMS = kptlrouter + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kptlrouter_SOURCES = router.c proc.c router.h diff --git a/lnet/router/Makefile.mk b/lnet/router/Makefile.mk new file mode 100644 index 0000000..64bd09b --- /dev/null +++ b/lnet/router/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += kptlrouter.o +kptlrouter-objs := router.o proc.o diff --git a/lnet/router/proc.c b/lnet/router/proc.c new file mode 100644 index 0000000..dd65b34 --- /dev/null +++ b/lnet/router/proc.c @@ -0,0 +1,78 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "router.h" + +#define KPR_PROC_ROUTER "sys/portals/router" + +int +kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data) +{ + unsigned long long bytes = kpr_fwd_bytes; + unsigned long packets = kpr_fwd_packets; + unsigned long errors = kpr_fwd_errors; + unsigned int qdepth = atomic_read (&kpr_queue_depth); + int len; + + *eof = 1; + if (off != 0) + return (0); + + len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); + + *start = page; + return (len); +} + +int +kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data) +{ + /* Ignore what we've been asked to write, and just zero the stats counters */ + kpr_fwd_bytes = 0; + kpr_fwd_packets = 0; + kpr_fwd_errors = 0; + + return (count); +} + +void +kpr_proc_init(void) +{ + struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL); + + if (entry == NULL) + { + CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); + return; + } + + entry->data = NULL; + entry->read_proc = kpr_proc_read; + entry->write_proc = kpr_proc_write; +} + +void +kpr_proc_fini(void) +{ + remove_proc_entry(KPR_PROC_ROUTER, 0); +} diff --git a/lnet/router/router.c b/lnet/router/router.c new file mode 100644 index 0000000..8a1de08 --- /dev/null +++ b/lnet/router/router.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "router.h" + +struct list_head kpr_routes; +struct list_head kpr_nals; + +unsigned long long kpr_fwd_bytes; +unsigned long kpr_fwd_packets; +unsigned long kpr_fwd_errors; +atomic_t kpr_queue_depth; + +/* Mostly the tables are read-only (thread and interrupt context) + * + * Once in a blue moon we register/deregister NALs and add/remove routing + * entries (thread context only)... */ +rwlock_t kpr_rwlock; + +kpr_router_interface_t kpr_router_interface = { + kprri_register: kpr_register_nal, + kprri_lookup: kpr_lookup_target, + kprri_fwd_start: kpr_forward_packet, + kprri_fwd_done: kpr_complete_packet, + kprri_shutdown: kpr_shutdown_nal, + kprri_deregister: kpr_deregister_nal, +}; + +kpr_control_interface_t kpr_control_interface = { + kprci_add_route: kpr_add_route, + kprci_del_route: kpr_del_route, + kprci_get_route: kpr_get_route, +}; + +int +kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) +{ + long flags; + struct list_head *e; + kpr_nal_entry_t *ne; + + CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + + PORTAL_ALLOC (ne, sizeof (*ne)); + if (ne == NULL) + return (-ENOMEM); + + memset (ne, 0, sizeof (*ne)); + memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); + + LASSERT (!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) + { + write_unlock_irqrestore 
(&kpr_rwlock, flags); + + CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid); + + PORTAL_FREE (ne, sizeof (*ne)); + return (-EEXIST); + } + } + + list_add (&ne->kpne_list, &kpr_nals); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + *argp = ne; + PORTAL_MODULE_USE; + return (0); +} + +void +kpr_shutdown_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (!ne->kpne_shutdown); + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */ + ne->kpne_shutdown = 1; + write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */ + + while (atomic_read (&ne->kpne_refcount) != 0) + { + CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", + ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); + + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } +} + +void +kpr_deregister_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ + LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); + + list_del (&ne->kpne_list); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (ne, sizeof (*ne)); + PORTAL_MODULE_UNUSE; +} + + +int +kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + int rc = -ENOENT; + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + + if (ne->kpne_shutdown) /* caller is shutting down */ + return (-ENOENT); + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a 
gateway to target_nid on the callers network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || + re->kpre_hi_nid < target_nid) + continue; + + /* found table entry */ + + if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ + rc = -EHOSTUNREACH; + else + { + rc = 0; + *gateway_nidp = re->kpre_gateway_nid; + } + break; + } + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + target_nid, ne->kpne_interface.kprni_nalid, rc, + (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); + return (rc); +} + +void +kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + struct list_head *e; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ + LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + + atomic_inc (&kpr_queue_depth); + + kpr_fwd_packets++; /* (loose) stats accounting */ + kpr_fwd_bytes += nob; + + if (src_ne->kpne_shutdown) /* caller is shutting down */ + goto out; + + fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ + atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || /* no match */ + re->kpre_hi_nid < target_nid) + continue; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, 
+ re->kpre_gateway_nid, re->kpre_gateway_nalid); + + if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) + break; /* don't route to same NAL */ + + /* Search for gateway's NAL's entry */ + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ + break; + + fwd->kprfd_gateway_nid = re->kpre_gateway_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; + } + break; + } + + read_unlock (&kpr_rwlock); + out: + kpr_fwd_errors++; + + CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + /* Can't find anywhere to forward to */ + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); +} + +void +kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) +{ + kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; + + CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! 
*/ + + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); + + CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */ +} + +int +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + long flags; + struct list_head *e; + kpr_route_entry_t *re; + + CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + gateway_nalid, gateway_nid, lo_nid, hi_nid); + + LASSERT(lo_nid <= hi_nid); + + PORTAL_ALLOC (re, sizeof (*re)); + if (re == NULL) + return (-ENOMEM); + + re->kpre_gateway_nalid = gateway_nalid; + re->kpre_gateway_nid = gateway_nid; + re->kpre_lo_nid = lo_nid; + re->kpre_hi_nid = hi_nid; + + LASSERT(!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > re2->kpre_hi_nid || + re->kpre_hi_nid < re2->kpre_lo_nid) + continue; + + CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" + "to ["LPX64" - "LPX64"]\n", + re->kpre_lo_nid, re->kpre_hi_nid, + re2->kpre_lo_nid, re2->kpre_hi_nid); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (re, sizeof (*re)); + return (-EINVAL); + } + + list_add (&re->kpre_list, &kpr_routes); + + write_unlock_irqrestore (&kpr_rwlock, flags); + return (0); +} + +int +kpr_del_route (ptl_nid_t nid) +{ + long flags; + struct list_head *e; + + CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + + LASSERT(!in_interrupt()); + write_lock_irqsave(&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + continue; + + list_del (&re->kpre_list); + write_unlock_irqrestore(&kpr_rwlock, 
flags); + + PORTAL_FREE(re, sizeof (*re)); + return (0); + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + return (-ENOENT); +} + +int +kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +{ + struct list_head *e; + + read_lock(&kpr_rwlock); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (idx-- == 0) { + *gateway_nalid = re->kpre_gateway_nalid; + *gateway_nid = re->kpre_gateway_nid; + *lo_nid = re->kpre_lo_nid; + *hi_nid = re->kpre_hi_nid; + + read_unlock(&kpr_rwlock); + return (0); + } + } + + read_unlock (&kpr_rwlock); + return (-ENOENT); +} + +static void __exit +kpr_finalise (void) +{ + LASSERT (list_empty (&kpr_nals)); + + while (!list_empty (&kpr_routes)) { + kpr_route_entry_t *re = list_entry(kpr_routes.next, + kpr_route_entry_t, + kpre_list); + + list_del(&re->kpre_list); + PORTAL_FREE(re, sizeof (*re)); + } + + kpr_proc_fini(); + + PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); + PORTAL_SYMBOL_UNREGISTER(kpr_control_interface); + + CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", + atomic_read(&portal_kmemory)); +} + +static int __init +kpr_initialise (void) +{ + CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", + atomic_read(&portal_kmemory)); + + rwlock_init(&kpr_rwlock); + INIT_LIST_HEAD(&kpr_routes); + INIT_LIST_HEAD(&kpr_nals); + + kpr_proc_init(); + + PORTAL_SYMBOL_REGISTER(kpr_router_interface); + PORTAL_SYMBOL_REGISTER(kpr_control_interface); + return (0); +} + +MODULE_AUTHOR("Eric Barton"); +MODULE_DESCRIPTION("Kernel Portals Router v0.01"); +MODULE_LICENSE("GPL"); + +module_init (kpr_initialise); +module_exit (kpr_finalise); + +EXPORT_SYMBOL (kpr_control_interface); +EXPORT_SYMBOL (kpr_router_interface); diff --git a/lnet/router/router.h b/lnet/router/router.h new file mode 100644 index 0000000..b8c3bec --- /dev/null +++ b/lnet/router/router.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 
8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _KPTLROUTER_H +#define _KPTLROUTER_H +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLROUTER + +#include +#include +#include + +typedef struct +{ + struct list_head kpne_list; + kpr_nal_interface_t kpne_interface; + atomic_t kpne_refcount; + int kpne_shutdown; +} kpr_nal_entry_t; + +typedef struct +{ + struct list_head kpre_list; + int kpre_gateway_nalid; + ptl_nid_t kpre_gateway_nid; + ptl_nid_t kpre_lo_nid; + ptl_nid_t kpre_hi_nid; +} kpr_route_entry_t; + +extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_shutdown_nal (void *arg); +extern void kpr_deregister_nal (void *arg); + +extern void kpr_proc_init (void); +extern void kpr_proc_fini (void); + +extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); 
+extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + +extern unsigned long long kpr_fwd_bytes; +extern unsigned long kpr_fwd_packets; +extern unsigned long kpr_fwd_errors; +extern atomic_t kpr_queue_depth; + +#endif /* _KPLROUTER_H */ diff --git a/lnet/tests/.cvsignore b/lnet/tests/.cvsignore new file mode 100644 index 0000000..051d1bd --- /dev/null +++ b/lnet/tests/.cvsignore @@ -0,0 +1,3 @@ +Makefile +Makefile.in +.deps diff --git a/lnet/tests/Makefile.am b/lnet/tests/Makefile.am new file mode 100644 index 0000000..7b47ae0 --- /dev/null +++ b/lnet/tests/Makefile.am @@ -0,0 +1,23 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r +LINK = $(LD) $(LDFLAGS) -o $@ +DEFS = +LIBS = +MODULE = $(basename) +EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh + +noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o + +pingsrv_o_SOURCES = ping_srv.c ping.h + +pingcli_o_SOURCES = ping_cli.c ping.h + +spingsrv_o_SOURCES = sping_srv.c ping.h + +spingcli_o_SOURCES = sping_cli.c ping.h diff --git a/lnet/tests/ping.h b/lnet/tests/ping.h new file mode 100644 index 0000000..f07444b --- /dev/null +++ b/lnet/tests/ping.h @@ -0,0 +1,80 @@ +#ifndef _KPING_INCLUDED +#define _KPING_INCLUDED + +#include + + +#define PTL_PING_IN_SIZE 256 // n packets per buffer +#define PTL_PING_IN_BUFFERS 2 // n fallback buffers + +#define PTL_PING_CLIENT 4 +#define PTL_PING_SERVER 5 + +#define PING_HEADER_MAGIC 0xDEADBEEF +#define PING_BULK_MAGIC 0xCAFEBABE + +#define PING_HEAD_BITS 0x00000001 +#define PING_BULK_BITS 0x00000002 +#define PING_IGNORE_BITS 0xFFFFFFFC + +#define PTL_PING_ACK 0x01 +#define PTL_PING_VERBOSE 0x02 +#define PTL_PING_VERIFY 0x04 +#define 
PTL_PING_PREALLOC 0x08 + + +#define NEXT_PRIMARY_BUFFER(index) \ + (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1)) + +#define PDEBUG(str, err) \ + CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) + + +/* Ping data to be passed via the ioctl to kernel space */ + +#if __KERNEL__ + + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +struct pingsrv_data { + + ptl_handle_ni_t ni; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + void *in_buf; + ptl_process_id_t my_id; + ptl_process_id_t id_local; + ptl_md_t mdin; + ptl_md_t mdout; + ptl_handle_md_t mdin_h; + ptl_handle_md_t mdout_h; + ptl_event_t evnt; + struct task_struct *tsk; +}; /* struct pingsrv_data */ + +struct pingcli_data { + + struct portal_ioctl_data *args; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + char *inbuf; + char *outbuf; + ptl_process_id_t myid; + ptl_process_id_t id_local; + ptl_process_id_t id_remote; + ptl_md_t md_in_head; + ptl_md_t md_out_head; + ptl_handle_md_t md_in_head_h; + ptl_handle_md_t md_out_head_h; + ptl_event_t ev; + struct task_struct *tsk; +}; /* struct pingcli_data */ + + +#endif /* __KERNEL__ */ + +#endif /* _KPING_INCLUDED */ diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c new file mode 100644 index 0000000..389ffbb --- /dev/null +++ b/lnet/tests/ping_cli.c @@ -0,0 +1,300 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) + +#define MAX_TIME 100000 + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + if ((rc = PtlMDUnlink (client->md_in_head_h))) + PDEBUG ("PtlMDUnlink", rc); + + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + int i, magic; + i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned)); + magic = *(int *)(ev->mem_desc.start + ev->offset); + + if(magic != 0xcafebabe) { + printk ("Unexpected response \n"); + return 1; + } + + if((i == count) || !count) + wake_up_process (client->tsk); + else + printk ("Received response after timeout for %d\n",i); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + unsigned ping_bulk_magic = PING_BULK_MAGIC; + int rc; + struct timeval tv1, tv2; + client->tsk = current; + client->args = args; + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return 
(NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + count = args->ioc_count; + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = (args->ioc_size + STDSIZE) + * count; + client->md_in_head.threshold = PTL_MD_THRESH_INF; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE + args->ioc_size; + client->md_out_head.threshold = args->ioc_count; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = 
PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic)); + + count = 0; + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return NULL; + } + while ((args->ioc_count - count)) { + memcpy (client->outbuf + sizeof(unsigned), + &(count), sizeof(unsigned)); + /* Put the ping packet */ + do_gettimeofday (&tv1); + + memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1, + sizeof(struct timeval)); + + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + printk ("sent msg no %d", count); + + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" :: timeout .....\n"); + } else { + do_gettimeofday (&tv2); + printk(" :: Reply in %u usec\n", + (unsigned)((tv2.tv_sec - tv1.tv_sec) + * 1000000 + (tv2.tv_usec - tv1.tv_usec))); + } + count++; + } + + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + + pingcli_shutdown (2); + + /* Success! 
*/ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + memset (client, 0, sizeof(struct pingcli_data)); + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c new file mode 100644 index 0000000..1037d09 --- /dev/null +++ b/lnet/tests/ping_srv.c @@ -0,0 +1,308 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Amey Inamdar + * Kedar Sovani + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) +#define MAXSIZE (16*1024*1024) + +static unsigned ping_head_magic; +static unsigned ping_bulk_magic; +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + case 5: + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, MAXSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + unsigned long magic; + unsigned long ping_bulk_magic = 0xcafebabe; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + magic = *((int *)(server->evnt.mem_desc.start + + server->evnt.offset)); + + + if(magic != 0xdeadbeef) { + printk("Unexpected Packet to the server\n"); + + } + memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); + + server->mdout.length = server->evnt.rlength; + server->mdout.start = server->in_buf; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + 
PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset)), + *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))), + *((int *)(ev->mem_desc.start + ev->offset + 2 * + sizeof(unsigned)))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "NAL %d not loaded\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, MAXSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! 
*/ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + ping_head_magic = PING_HEADER_MAGIC; + ping_bulk_magic = PING_BULK_MAGIC; + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c new file mode 100644 index 0000000..4cef08b --- /dev/null +++ b/lnet/tests/sping_cli.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes + assumed */ + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, STDSIZE); + + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + wake_up_process (client->tsk); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + const ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + int rc; + + client->tsk = current; + client->args = args; + + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, STDSIZE); + + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded.\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = STDSIZE; + client->md_in_head.threshold = 1; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, STDSIZE); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE; + client->md_out_head.threshold = 1; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Put the ping packet */ + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 
0))) {
+                PDEBUG ("PtlPut (header)", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+
+        count = 0;
+        set_current_state (TASK_INTERRUPTIBLE);
+        rc = schedule_timeout (20 * args->ioc_timeout);
+        if (rc == 0) {
+                printk (" Time out on the server\n");
+                pingcli_shutdown (2);
+                return NULL;
+        } else
+                printk("Received respose from the server \n");
+
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_setup() */
+
+
+
+/* called by the portals_ioctl for ping requests
+ *
+ * Allocates the global ping client state and runs a single
+ * request/response exchange through pingcli_start(), which hands the
+ * state back to pingcli_shutdown() before it returns.
+ */
+static int kping_client(struct portal_ioctl_data *args)
+{
+
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        /* Check the allocation before touching it: the memset used to run
+         * first and dereferenced NULL when the allocation failed. */
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                /* NOTE(review): failure is still reported as 0, as before;
+                 * confirm whether the ioctl caller should see -ENOMEM. */
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c
new file mode 100644
index 0000000..a18ea35
--- /dev/null
+++ b/lnet/tests/sping_srv.c
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf
+ * Amey Inamdar
+ * Kedar Sovani
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by
the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) + +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#endif + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, STDSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + server->mdout.start = server->in_buf; + server->mdout.length = STDSIZE; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static 
int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, STDSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, 
server->mdin,
+                               PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                CDEBUG (D_OTHER, "ping server resources allocated\n");
+        }
+
+        /* Success! */
+        return server;
+} /* pingsrv_setup() */
+
+/* Set up the server state and start the service thread.
+ * Returns 0 on success or a negative errno.
+ */
+static int pingsrv_start(void)
+{
+        int rc;
+
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+        /* Without a service thread nothing would ever answer a ping or
+         * release the resources, so undo the setup and report the error. */
+        rc = kernel_thread (pingsrv_thread,NULL,0);
+        if (rc < 0) {
+                CERROR ("Unable to start pingsrv thread: %d\n", rc);
+                pingsrv_shutdown (1);
+                return rc;
+        }
+        return 0;
+} /* pingsrv_start() */
+
+
+
+/* Module entry point: allocate the server structure, then start it. */
+static int __init pingsrv_init(void)
+{
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        /* pingsrv_setup() dereferences server straight away */
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+/* Module exit: ask the service thread to stop and wait until it has. */
+static void __exit pingsrv_cleanup(void)
+{
+        /* NOTE(review): no matching proc entry creation is visible in this
+         * file; confirm this call is still wanted. */
+        remove_proc_entry ("net/pingsrv", NULL);
+
+        /* running = 0 makes pingsrv_thread leave its loop; the thread sets
+         * running back to 1 once it has released its resources. */
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh
new file mode 100644
index 0000000..c9b7c16
--- /dev/null
+++ b/lnet/tests/startclient.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+        PING=pingcli.o
+else
+        PING=spingcli.o
+fi
+
+case "$1" in
+        toe)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../toenal/ktoenal.o
+                /sbin/insmod ./$PING
+                echo ktoenal > /tmp/nal
+        ;;
+
+        tcp)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../socknal/ksocknal.o
+                /sbin/insmod ./$PING
+                echo ksocknal > /tmp/nal
+        ;;
+
+        elan)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../qswnal/kqswnal.o
+                /sbin/insmod ./$PING
+                echo kqswnal > /tmp/nal
+        ;;
+
+        *)
+                echo "Usage : ${0} < tcp | toe | elan >"
+                exit 1;
+esac
+exit 0;
diff --git
a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh
new file mode 100644
index 0000000..942300e
--- /dev/null
+++ b/lnet/tests/startserver.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ "$SIMPLE" -eq 0 ]; then
+        PING=pingsrv.o
+else
+        PING=spingsrv.o
+fi
+
+# nal= values follow the ping server's module parameter description:
+# 4-toenal, 2-ksocknal, 1-kqswnal
+case "$1" in
+        toe)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../toenal/ktoenal.o
+                /sbin/insmod ./$PING nal=4
+                echo ktoenal > /tmp/nal
+        ;;
+
+        tcp)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../socknal/ksocknal.o
+                /sbin/insmod ./$PING nal=2
+                echo ksocknal > /tmp/nal
+        ;;
+
+        elan)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../qswnal/kqswnal.o
+                /sbin/insmod ./$PING nal=1
+                echo kqswnal > /tmp/nal
+        ;;
+
+        *)
+                echo "Usage : ${0} < tcp | toe | elan >"
+                exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lnet/tests/stopclient.sh b/lnet/tests/stopclient.sh
new file mode 100644
index 0000000..f7e3aa1
--- /dev/null
+++ b/lnet/tests/stopclient.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+# Same default and mapping as startclient.sh: 0 -> pingcli, else spingcli
+SIMPLE=${SIMPLE:-0}
+
+if [ "$SIMPLE" -eq 0 ]; then
+        PING=pingcli
+else
+        PING=spingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lnet/tests/stopserver.sh b/lnet/tests/stopserver.sh
new file mode 100644
index 0000000..3e81831
--- /dev/null
+++ b/lnet/tests/stopserver.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+# Same default and mapping as startserver.sh: 0 -> pingsrv, else spingsrv
+SIMPLE=${SIMPLE:-0}
+
+if [ "$SIMPLE" -eq 0 ]; then
+        PING=pingsrv
+else
+        PING=spingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
new file mode 100644
index 0000000..b62b401
--- /dev/null
+++ b/lnet/ulnds/Makefile.am
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c
procapi.c proclib.c connection.c tcpnal.c connection.h diff --git a/lnet/ulnds/README b/lnet/ulnds/README new file mode 100644 index 0000000..6cb93d9 --- /dev/null +++ b/lnet/ulnds/README @@ -0,0 +1,53 @@ +This library implements two NAL interfaces, both running over IP. +The first, tcpnal, creates TCP connections between participating +processes in order to transport the portals requests. The second, +ernal, provides a simple transport protocol which runs over +UDP datagrams. + +The interface functions return both of these values in host order for +convenience and readability. However this means that addresses +exchanged in messages between hosts of different orderings will not +function properly. + +Both NALs use the same support functions in order to schedule events +and communicate with the generic portals implementation. + + ------------------------- + | api | + |_______________________| + | lib | + |_______________________| + | ernal | |tcpnal | + |--------| |----------| + | udpsock| |connection| + |-----------------------| + | timer/select | + ------------------------- + + + These NALs uses the framework from fdnal of a pipe between the api +and library sides. This is wrapped up in the select on the library +side, and blocks on the api side. Performance could be severely +enhanced by collapsing this aritificial barrier, by using shared +memory queues, or by wiring the api layer directly to the library. + + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE. pid is now a completely arbitrary number in the +range of 0 to 255. The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". 
+TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lnet/ulnds/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* address.c: + * this file provides functions to aquire the IP address of the node + * and translate them into a NID/PID pair which supports a static + * mapping of virtual nodes into the port range of an IP socket. 
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/* Function: get_node_id
+ * Returns: a 32 bit id for this node: its IPv4 address in host byte
+ *          order, or 0 when no address could be determined
+ *
+ * get_node_id() determines the host name and uses the resolver to
+ * find out its ip address. This is fairly fragile and inflexible, but
+ * explicitly asking about interfaces and their addresses is very
+ * complicated and nonportable.
+ *
+ * Setting PTL_HOSTID overrides the lookup; the value is either dotted
+ * decimal (n.n.n.n) or a number whose second character is 'x' (0x...).
+ */
+static unsigned int get_node_id(void)
+{
+        char buffer[255];
+        struct hostent *he;
+        char * host_envp;
+
+        if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+                const unsigned char *addr;
+
+                if (gethostname(buffer,sizeof(buffer)) != 0)
+                        return(0);
+                /* a truncated name need not be terminated */
+                buffer[sizeof(buffer)-1] = '\0';
+                he=gethostbyname(buffer);
+                if (!he || he->h_length < 4 || !he->h_addr_list[0])
+                        return(0);
+                /* The resolver returns the address as network-order bytes;
+                 * assembling them by hand gives the same host-order value
+                 * as ntohl() without a possibly unaligned 32 bit load. */
+                addr = (const unsigned char *)he->h_addr_list[0];
+                return(((unsigned int)addr[0]<<24) |
+                       ((unsigned int)addr[1]<<16) |
+                       ((unsigned int)addr[2]<<8) |
+                        (unsigned int)addr[3]);
+        }
+        else
+        {
+                /* guard the [1] access against an empty string */
+                if (host_envp[0] == '\0' || host_envp[1] != 'x')
+                {
+                        unsigned int a, b, c, d;
+
+                        /* unsigned octets: shifting a signed value >= 128
+                         * left by 24 would overflow int */
+                        if (sscanf(host_envp, "%u.%u.%u.%u",
+                                   &a, &b, &c, &d) != 4 ||
+                            a > 255 || b > 255 || c > 255 || d > 255)
+                        {
+                                fprintf(stderr,
+                                        "PTL_HOSTID '%s' is not a valid address\n",
+                                        host_envp);
+                                return(0);
+                        }
+                        return ((a<<24) | (b<<16) | (c<<8) | d);
+                }
+                else
+                {
+                        long long hostid = strtoll(host_envp, 0, 0);
+                        return((unsigned int) hostid);
+                }
+        }
+}
+
+
+/* Function: set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ * iptop8 fields of the procnal structures.
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lnet/ulnds/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c new file mode 100644 index 0000000..89c9f78 --- /dev/null +++ b/lnet/ulnds/connection.c @@ -0,0 +1,293 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+        int offset=0,rc;
+
+        if (len){
+                do {
+                        rc=syscall(SYS_read, c->fd, dest+offset, len-offset);
+                        if (rc<0 && errno==EINTR)
+                                continue;       /* interrupted: retry */
+                        if (rc<=0){
+                                /* read error, or end of file (rc==0), where
+                                 * errno carries no information */
+                                remove_connection(c);
+                                return(0);
+                        }
+                        offset+=rc;
+                } while (offset<len);
+        }
+        return(1);
+}
+
+
+/* NOTE(review): the end of read_connection() above and the head of this
+ * function were lost when the page was extracted; both were rebuilt from
+ * the surviving call expression and from the use of connection_input as
+ * the read handler below. Confirm against the original file.
+ *
+ * Read-handler trampoline: passes the connection that became readable to
+ * the input handler registered with the manager.
+ */
+static int connection_input(void *d)
+{
+        connection c = d;
+        return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function: allocate_connection
+ * Arguments: m: manager the allocation is occurring in the context of
+ *            ip, port: endpoint address for this connection
+ *            fd: open file descriptor for the socket
+ * Returns: an allocated connection structure, or zero if out of memory
+ *          (the caller keeps ownership of fd in that case)
+ *
+ * just encompasses the action common to active and passive
+ * connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                                      unsigned int ip,
+                                      unsigned short port,
+                                      int fd)
+{
+        connection c=malloc(sizeof(struct connection));
+        unsigned int id[2];
+
+        if (!c)
+                return(0);
+        c->m=m;
+        c->fd=fd;
+        c->ip=ip;
+        c->port=port;
+        id[0]=ip;
+        id[1]=port;
+        register_io_handler(fd,READ_HANDLER,connection_input,c);
+        hash_table_insert(m->connections,c,id);
+        return(c);
+}
+
+
+/* Function: new_connection
+ * Arguments: z: opaque argument holding the manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ * called when the bound service socket receives
+ * a new connection request, it always accepts and
+ * installs a new connection
+ */
+static int new_connection(void *z)
+{
+        manager m=z;
+        struct sockaddr_in s;
+        socklen_t len=sizeof(struct sockaddr_in);
+        int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+
+        /* nothing to install if the accept failed; stay registered */
+        if (fd<0)
+                return(1);
+        /* cfs specific hack */
+        //unsigned short pid=s.sin_port;
+        /* sin_addr is in network order, the table is keyed by host order */
+        if (!allocate_connection(m,ntohl(s.sin_addr.s_addr),0/*pid*/,fd))
+                close(fd);
+        return(1);
+}
+
+
+/* Function: force_tcp_connection
+ * Arguments: t: tcpnal
+ *            dest: portals endpoint for the connection
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection
+ */
+connection force_tcp_connection(manager m, +
unsigned int ip, + unsigned short port) +{ + connection c; + struct sockaddr_in addr; + unsigned int id[2]; + + port = tcpnal_acceptor_port; + + id[0]=ip; + id[1]=port; + + if (!(c=hash_table_find(m->connections,id))){ + int fd; + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("tcpnal socket failed"); + exit(-1); + } + if (connect(fd, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in))) + { + perror("tcpnal connect"); + return(0); + } + return(allocate_connection(m,ip,port,fd)); + } + return(c); +} + + +/* Function: bind_socket + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: 1 on success, or 0 on error + * + * bind_socket() attempts to allocate and bind a socket to the requested + * port, or dynamically assign one from the kernel should the port be + * zero. Sets the bound and bound_handler elements of m. + * + * TODO: The port should be an explicitly sized type. 
+ */ +static int bind_socket(manager m,unsigned short port) +{ + struct sockaddr_in addr; + int alen=sizeof(struct sockaddr_in); + + if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) + return(0); + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = 0; + addr.sin_port = port; + + if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ + perror ("tcpnal bind"); + return(0); + } + + getsockname(m->bound,(struct sockaddr *)&addr, &alen); + + m->bound_handler=register_io_handler(m->bound,READ_HANDLER, + new_connection,m); + listen(m->bound,5); + m->port=addr.sin_port; + return(1); +} + + +/* Function: shutdown_connections + * Arguments: m: the manager structure + * + * close all connections and reclaim resources + */ +void shutdown_connections(manager m) +{ + close(m->bound); + remove_io_handler(m->bound_handler); + hash_destroy_table(m->connections,remove_connection); + free(m); +} + + +/* Function: init_connections + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: a newly allocated manager structure, or + * zero if the fixed port could not be bound + */ +manager init_connections(unsigned short pid, + int (*input)(), + void *a) +{ + manager m=(manager)malloc(sizeof(struct manager)); + m->connections=hash_create_table(compare_connection,connection_key); + m->handler=input; + m->handler_arg=a; + if (bind_socket(m,pid)) return(m); + free(m); + return(0); +} diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h new file mode 100644 index 0000000..f6b2994 --- /dev/null +++ b/lnet/ulnds/connection.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, + unsigned int ip, + unsigned int short); +manager init_connections(unsigned short, + int (*f)(void *,connection), + void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, + unsigned char *dest, + int len); diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lnet/ulnds/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +int smp_processor_id = 1; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; +} + + +void portals_debug_print(void) +{ + return; +} + + +void portals_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + + +int portals_debug_init(unsigned long bufsize) +{ + debug_file_fd = stdout; + return 0; +} + +int portals_debug_cleanup(void) +{ + return 0; //close(portals_debug_fd); +} + +int portals_debug_clear_buffer(void) +{ + return 0; +} + +int portals_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, "*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +int portals_debug_copy_to_user(char *buf, unsigned long len) +{ + return 0; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + const char *format, ...) +{ + va_list ap; + unsigned long flags; + struct timeval tv; + int nob; + + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+         */
+
+        gettimeofday(&tv, NULL);
+
+        /* first print: plain assignment, nob holds no value before this */
+        nob = fprintf(debug_file_fd,
+                      "%02x:%06x:%d:%lu.%06lu ",
+                      subsys >> 24, mask, smp_processor_id,
+                      (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        nob += fprintf(debug_file_fd,
+                       "(%s:%d:%s() %d+%ld): ",
+                       file, line, fn, 0,
+                       (long)(8192 - ((unsigned long)&flags & 8191UL)));
+
+        /* a va_list has to go through vfprintf; fprintf would consume it
+         * as if it were the first variadic argument */
+        va_start (ap, format);
+        nob += vfprintf(debug_file_fd, format, ap);
+        va_end (ap);
+
+
+}
+
diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h
new file mode 100644
index 0000000..34dd070
--- /dev/null
+++ b/lnet/ulnds/dispatch.h
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002 Cray Inc.
+ * Copyright (c) 2002 Eric Hoffman
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+struct io_handler{
+  io_handler *last;
+  io_handler next;
+  int fd;
+  int type;
+  int (*function)(void *);
+  void *argument;
+  int disabled;
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h
new file mode 100644
index 0000000..85b1e18
--- /dev/null
+++ b/lnet/ulnds/ipmap.h
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002 Cray Inc.
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#define DIRECT_IP_MODE +#ifdef DIRECT_IP_MODE +#define PNAL_NID(in_addr, port) (in_addr) +#define PNAL_PID(pid) (pid) +#define PNAL_IP(in_addr, port) (in_addr) +#define PNAL_PORT(nid, pid) (pid) +#else + +#define PNAL_BASE_PORT 4096 +#define PNAL_HOSTID_SHIFT 24 +#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) +#define PNAL_VNODE_SHIFT 8 +#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) +#define PNAL_PID_SHIFT 8 +#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) + +#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ + << PNAL_VNODE_SHIFT) \ + | (((ntohs(port)-PNAL_BASE_PORT) >>\ + PNAL_PID_SHIFT))) +#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) + +#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ + >> PNAL_VNODE_SHIFT)\ + | (t->iptop8 << PNAL_HOSTID_SHIFT))) +#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ + << PNAL_VNODE_SHIFT) \ + | ((pid) & PNAL_PID_MASK)) \ + + PNAL_BASE_PORT)) +#endif diff --git a/lnet/ulnds/pqtimer.c b/lnet/ulnds/pqtimer.c new file mode 100644 index 0000000..fa2fb4f --- /dev/null +++ b/lnet/ulnds/pqtimer.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* timer.c: + * this file implements a simple priority-queue based timer system. when + * combined with a file which implements now() and block(), it can + * be used to provide coarse-grained time-based callbacks. + */ + +#include +#include +#include + +struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +}; + +typedef struct thunk *thunk; +struct thunk { + void (*f)(void *); + void *a; + thunk next; +}; + +extern when now(void); + +static thunk thunks; +static int internal; +static void (*block_function)(when); +static int number_of_timers; +static int size_of_pqueue; +static timer *timers; + + +static void heal(int where) +{ + int left=(where<<1); + int right=(where<<1)+1; + int min=where; + timer temp; + + if (left <= number_of_timers) + if (timers[left]->w < timers[min]->w) min=left; + if (right <= number_of_timers) + if (timers[right]->w < timers[min]->w) min=right; + if (min != where){ + temp=timers[where]; + timers[where]=timers[min]; + timers[min]=temp; + heal(min); + } +} + +static void add_pqueue(int i) +{ + timer temp; + int parent=(i>>1); + if ((i>1) && (timers[i]->w< timers[parent]->w)){ + temp=timers[i]; + timers[i]=timers[parent]; + timers[parent]=temp; + add_pqueue(parent); + } +} + +/* add_timer: store t in the next free slot of the 1-based heap array and + * sift it up; the array grows by 10 slots whenever it is full. + */ +static void add_timer(timer t) +{ + if (size_of_pqueue<(number_of_timers+2)){ + int oldsize=size_of_pqueue; + /* the queue holds timer pointers, so size it by sizeof(timer), not the struct */ + timer *new=(void *)malloc(sizeof(timer)*(size_of_pqueue+=10)); + memcpy(new,timers,sizeof(timer)*oldsize); + free(timers); /* the outgrown array was previously leaked */ + timers=new; + } + timers[++number_of_timers]=t; + add_pqueue(number_of_timers); +} + +/* Function: register_timer + * Arguments: interval: the time interval from the current time when + * the timer function should be called + * function: the function to call when the time has expired + * argument: 
the argument to call it with. + * Returns: a pointer to a timer structure + */ +timer register_timer(when interval, + void (*function)(void *), + void *argument) +{ + timer t=(timer)malloc(sizeof(struct timer)); + + t->arg=argument; + t->function=function; + t->interval=interval; + t->disable=0; + t->w=now()+interval; + add_timer(t); + if (!internal && (number_of_timers==1)) + block_function(t->w); + return(t); +} + +/* Function: remove_timer + * Arguments: t: + * Returns: nothing + * + * remove_timer removes a timer from the system, insuring + * that it will never be called. It does not actually + * free the timer due to reentrancy issues. + */ + +void remove_timer(timer t) +{ + t->disable=1; +} + + + +void timer_fire() +{ + timer current; + + current=timers[1]; + timers[1]=timers[number_of_timers--]; + heal(1); + if (!current->disable) { + (*current->function)(current->arg); + } + free(current); +} + +when next_timer(void) +{ + when here=now(); + + while (number_of_timers && (timers[1]->w <= here)) timer_fire(); + if (number_of_timers) return(timers[1]->w); + return(0); +} + +/* Function: timer_loop + * Arguments: none + * Returns: never + * + * timer_loop() is the blocking dispatch function for the timer. + * Is calls the block() function registered with init_timer, + * and handles associated with timers that have been registered. + */ +void timer_loop() +{ + when here; + + while (1){ + thunk z; + here=now(); + + for (z=thunks;z;z=z->next) (*z->f)(z->a); + + if (number_of_timers){ + if (timers[1]->w > here){ + (*block_function)(timers[1]->w); + } else { + timer_fire(); + } + } else { + thunk z; + for (z=thunks;z;z=z->next) (*z->f)(z->a); + (*block_function)(0); + } + } +} + + +/* Function: register_thunk + * Arguments: f: the function to call + * a: the single argument to call it with + * + * Thunk functions get called at irregular intervals, they + * should not assume when, or take a particularily long + * amount of time. 
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lnet/ulnds/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lnet/ulnds/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Function: forward + * Arguments: nal_t *nal: pointer to my top-side nal structure + * id: the command to pass to the lower layer + * args, args_len:pointer to and length of the request + * ret, ret_len: pointer to and size of the result + * Returns: a portals status code + * + * forwards a packaged api call from the 'api' side to the 'library' + * side, and collects the result + */ +#define forward_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(PTL_SEGV);\ + } +static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len, + void *ret, ptl_size_t ret_len) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int lib=p->to_lib[1]; + int k; + + forward_failure(write,lib, &id, sizeof(id)); + forward_failure(write,lib,&args_len, sizeof(args_len)); + forward_failure(write,lib,&ret_len, sizeof(ret_len)); + forward_failure(write,lib,args, args_len); + + /* retry only when the read itself was interrupted; a short or failed read falls through to the error check below */ + do { + k=syscall(SYS_read, p->from_lib[0], ret, ret_len); + } while ((k<0) && (errno == EINTR)); + + 
if(k!=ret_len){ + perror("nal: read return block"); + return PTL_SEGV; + } + return (PTL_OK); +} +#undef forward_failure + + +/* Function: shutdown + * Arguments: nal: a pointer to my top side nal structure + * ni: my network interface index + * + * cleanup nal state, reclaim the lower side thread and + * its state using PTL_FINI codepoint + */ +static int procbridge_shutdown(nal_t *n, int ni) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int code=PTL_FINI; + + syscall(SYS_write, p->to_lib[1],&code,sizeof(code)); + syscall(SYS_read, p->from_lib[0],&code,sizeof(code)); + + syscall(SYS_close, p->to_lib[0]); + syscall(SYS_close, p->to_lib[1]); + syscall(SYS_close, p->from_lib[0]); + syscall(SYS_close, p->from_lib[1]); + + free(p); + return(0); +} + + +/* Function: validate + * useless stub + */ +static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent) +{ + return(0); +} + + +/* Function: yield + * Arguments: pid: + * + * this function was originally intended to allow the + * lower half thread to be scheduled to allow progress. we + * overload it to explicitly block until signalled by the + * lower half. + */ +static void procbridge_yield(nal_t *n) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_wait(&p->cond,&p->mutex); + pthread_mutex_unlock(&p->mutex); +} + + +static void procbridge_lock(nal_t * nal, unsigned long *flags){} +static void procbridge_unlock(nal_t * nal, unsigned long *flags){} +/* api_nal + * the interface vector to allow the generic code to access + * this nal. this is seperate from the library side nal_cb. 
+ * TODO: should be dyanmically allocated + */ +static nal_t api_nal = { + ni: {0}, + nal_data: NULL, + forward: procbridge_forward, + shutdown: procbridge_shutdown, + validate: procbridge_validate, + yield: procbridge_yield, + lock: procbridge_lock, + unlock: procbridge_unlock +}; + +/* Function: bridge_init + * + * Arguments: pid: requested process id (port offset) + * PTL_ID_ANY not supported. + * desired: limits passed from the application + * and effectively ignored + * actual: limits actually allocated and returned + * + * Returns: a pointer to my statically allocated top side NAL + * structure + * + * initializes the tcp nal. we define unix_failure as an + * error wrapper to cut down clutter. + */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) 
return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lnet/ulnds/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include +#include +#include + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lnet/ulnds/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. + * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 
'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */ + +#define data_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(0);\ + } +static int data_from_api(void *arg) +{ + bridge b = arg; + procbridge p=(procbridge)b->local; + /* where are these two sizes derived from ??*/ + char arg_block[ 256 ]; + char ret_block[ 128 ]; + ptl_size_t arg_len,ret_len; + int fd=p->to_lib[0]; + int index; + + data_failure(read,fd, &index, sizeof(index)); + + if (index==PTL_FINI) { + lib_fini(b->nal_cb); + if (b->shutdown) (*b->shutdown)(b); + syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive)); + + /* a heavy-handed but convenient way of shutting down + the lower side thread */ + pthread_exit(0); + } + + data_failure(read,fd, &arg_len, sizeof(arg_len)); + data_failure(read,fd, &ret_len, sizeof(ret_len)); + + /* both lengths come off the pipe: refuse anything that would overrun + the fixed-size blocks, using the same failure path as a bad read */ + if (arg_len > sizeof(arg_block) || ret_len > sizeof(ret_block)) { + lib_fini(b->nal_cb); + return(0); + } + + data_failure(read,fd, arg_block, arg_len); + + lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block); + + data_failure(write,p->from_lib[1],ret_block, ret_len); + return(1); +} +#undef data_failure + + + +static void wakeup_topside(void *z) +{ + bridge b=z; + procbridge p=b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); +} + + +/* Function: nal_thread + * Arguments: z: an opaque reference to a nal control structure + * allocated and partially populated by the api level code + * Returns: nothing, and only on error or explicit shutdown + * + * This function is the entry point of the pthread initiated on + * the api side of the interface. This thread is used to handle + * asynchronous delivery to the application. 
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type)))) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); + /* initialize the generic 'library' level code */ + + rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lnet/ulnds/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */ +void remove_io_handler (io_handler i) +{ + i->disabled=1; +} + +static void set_flag(io_handler n,fd_set *fds) +{ + if (n->type & READ_HANDLER) FD_SET(n->fd,fds); + if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2); +} + + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return, + * or 0 to block until io is available + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + * Handlers marked disabled are unlinked and freed before the select; + * a handler whose function returns 0 is marked disabled. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int result; + io_handler j; + io_handler *k; + + /* TODO: loop until the entire interval is expired*/ + if (until){ + /* a deadline already in the past must poll, not wrap to a huge unsigned interval */ + when current=now(); + when interval=(until>current)?(until-current):0; + timeout.tv_sec=(interval>>32); + /* the low 32 bits are a fraction of a second in 1/2^32 units; scale to microseconds */ + timeout.tv_usec=((interval&0xffffffffull)*1000000)>>32; + timeout_pointer=&timeout; + } else timeout_pointer=0; + + FD_ZERO(fds); + FD_ZERO(fds+1); + FD_ZERO(fds+2); + for (k=&io_handlers;*k;){ + /* drop every consecutive disabled handler so none is handed to select */ + while (*k && (*k)->disabled){ + j=*k; + *k=(*k)->next; + free(j); + } + if (*k) { + set_flag(*k,fds); + k=&(*k)->next; + } + } + result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer); + + if (result > 0) + for (j=io_handlers;j;j=j->next){ + if (!(j->disabled) && + ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) || + (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) || + (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){ + if (!(*j->function)(j->argument)) + j->disabled=1; + } + } +} + +/* Function: init_unix_timer() + * is called to initialize the library + */ +void init_unix_timer() +{ + io_handlers=0; + gettimeofday(&beginning_of_epoch, 0); + initialize_timer(select_timer_block); +} diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am new file mode 100644 index 0000000..b62b401 --- /dev/null +++ b/lnet/ulnds/socklnd/Makefile.am @@ -0,0 +1,5 @@ +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir) 
+lib_LIBRARIES = libtcpnal.a +pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h diff --git a/lnet/ulnds/socklnd/README b/lnet/ulnds/socklnd/README new file mode 100644 index 0000000..6cb93d9 --- /dev/null +++ b/lnet/ulnds/socklnd/README @@ -0,0 +1,53 @@ +This library implements two NAL interfaces, both running over IP. +The first, tcpnal, creates TCP connections between participating +processes in order to transport the portals requests. The second, +ernal, provides a simple transport protocol which runs over +UDP datagrams. + +The interface functions return both of these values in host order for +convenience and readability. However, this means that addresses +exchanged in messages between hosts of different orderings will not +function properly. + +Both NALs use the same support functions in order to schedule events +and communicate with the generic portals implementation. + + ------------------------- + | api | + |_______________________| + | lib | + |_______________________| + | ernal | |tcpnal | + |--------| |----------| + | udpsock| |connection| + |-----------------------| + | timer/select | + ------------------------- + + + These NALs use the framework from fdnal of a pipe between the api +and library sides. This is wrapped up in the select on the library +side, and blocks on the api side. Performance could be greatly +improved by collapsing this artificial barrier, by using shared +memory queues, or by wiring the api layer directly to the library. + + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE. 
pid is now a completely arbitrary number in the +range of 0 to 255. The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". +TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lnet/ulnds/socklnd/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* address.c: + * this file provides functions to acquire the IP address of the node + * and translate them into a NID/PID pair which supports a static + * mapping of virtual nodes into the port range of an IP socket. +*/ + +#include +#include +#include +#include +#include +#include +#include + + +/* Function: get_node_id + * Returns: a 32 bit id for this node: its IPv4 address in host byte + * order, or 0 when no address can be determined + * + * get_node_id() determines the host name and uses the resolver to + * find out its ip address. This is fairly fragile and inflexible, but + * explicitly asking about interfaces and their addresses is very + * complicated and nonportable. + * + * If the PTL_HOSTID environment variable is set it is used instead, + * parsed as dotted decimal unless its second character is 'x'. + */ +static unsigned int get_node_id(void) +{ + char buffer[255]; + unsigned int x; + struct hostent *he; + char * host_envp; + + if (!(host_envp = getenv("PTL_HOSTID"))) + { + /* without a host name there is nothing to resolve */ + if (gethostname(buffer,sizeof(buffer))) + return(0); + buffer[sizeof(buffer)-1] = 0; /* a truncated name may come back unterminated */ + he=gethostbyname(buffer); + if (he) + x=*(unsigned int *)he->h_addr_list[0]; + else + x = 0; + return(ntohl(x)); + } + else + { + /* test [0] first so an empty value never reads past its terminator */ + if (!host_envp[0] || host_envp[1] != 'x') + { + /* unsigned keeps the <<24 of a first octet >= 128 well defined */ + unsigned int a, b, c, d; + /* a malformed value yields 0, like a failed lookup, rather than uninitialised data */ + if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + return(0); + return ((a<<24) | (b<<16) | (c<<8) | d); + } + else + { + long long hostid = strtoll(host_envp, 0, 0); + return((unsigned int) hostid); + } + } +} + + +/* Function: set_address + * Arguments: t: a procnal structure to populate with the request + * + * set_address performs the bit manipulations to set the nid, pid, and + * iptop8 fields of the procnal structures. 
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lnet/ulnds/socklnd/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c new file mode 100644 index 0000000..89c9f78 --- /dev/null +++ b/lnet/ulnds/socklnd/connection.c @@ -0,0 +1,293 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */ +int read_connection(connection c, + unsigned char *dest, + int len) +{ + int offset=0,rc; + + if (len){ + do { + if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){ + if (errno==EINTR) { + rc=0; + } else { + remove_connection(c); + return(0); + } + } + offset+=rc; + } while (offset<len); + } + return(1); +} + + +/* Function: connection_input + * Arguments: d: the connection, passed as the opaque io handler argument + * Returns: the value returned by the manager's registered handler + * + * registered by allocate_connection() as the READ_HANDLER callback for + * the connection's fd; it hands the connection to the manager's handler. + * NOTE(review): the head of this function was lost to extraction damage + * in this copy and is reconstructed from its surviving tail and from + * its use in allocate_connection(); confirm against the pristine source. + */ +static int connection_input(void *d) +{ + connection c = d; + return((*c->m->handler)(c->m->handler_arg,c)); +} + + +/* Function: allocate_connection + * Arguments: t: tcpnal the allocation is occuring in the context of + * dest: portal endpoint address for this connection + * fd: open file descriptor for the socket + * Returns: an allocated connection structure + * + * just encompasses the action common to active and passive + * connections of allocation and placement in the global table + */ +static connection allocate_connection(manager m, + unsigned int ip, + unsigned short port, + int fd) +{ + connection c=malloc(sizeof(struct connection)); + unsigned int id[2]; + c->m=m; + c->fd=fd; + c->ip=ip; + c->port=port; + id[0]=ip; + id[1]=port; + register_io_handler(fd,READ_HANDLER,connection_input,c); + hash_table_insert(m->connections,c,id); + return(c); +} + + +/* Function: new_connection + * Arguments: t: opaque argument holding the tcpname + * Returns: 1 in order to reregister for new connection requests + * + * called when the bound service socket recieves + * a new connection request, it always accepts and + * installs a new connection + */ +static int new_connection(void *z) +{ + manager m=z; + struct sockaddr_in s; + int len=sizeof(struct sockaddr_in); + int fd=accept(m->bound,(struct sockaddr *)&s,&len); + unsigned int nid=*((unsigned int *)&s.sin_addr); + /* cfs specific hack */ + //unsigned short pid=s.sin_port; + allocate_connection(m,htonl(nid),0/*pid*/,fd); + return(1); +} + + +/* Function: force_tcp_connection + * Arguments: t: tcpnal + * dest: portals endpoint for the connection + * Returns: an allocated connection structure, either + * a pre-existing one, or a new connection + */ +connection force_tcp_connection(manager m, + 
unsigned int ip, + unsigned short port) +{ + connection c; + struct sockaddr_in addr; + unsigned int id[2]; + + port = tcpnal_acceptor_port; + + id[0]=ip; + id[1]=port; + + if (!(c=hash_table_find(m->connections,id))){ + int fd; + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("tcpnal socket failed"); + exit(-1); + } + if (connect(fd, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in))) + { + perror("tcpnal connect"); + /* do not leak the socket when the peer cannot be reached */ + close(fd); + return(0); + } + return(allocate_connection(m,ip,port,fd)); + } + return(c); +} + + +/* Function: bind_socket + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: 1 on success, or 0 on error + * + * bind_socket() attempts to allocate and bind a socket to the requested + * port, or dynamically assign one from the kernel should the port be + * zero. Sets the bound and bound_handler elements of m. + * + * TODO: The port should be an explicitly sized type. 
+ */ +static int bind_socket(manager m,unsigned short port) +{ + struct sockaddr_in addr; + int alen=sizeof(struct sockaddr_in); + + if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) + return(0); + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = 0; + addr.sin_port = port; + + if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ + perror ("tcpnal bind"); + return(0); + } + + getsockname(m->bound,(struct sockaddr *)&addr, &alen); + + m->bound_handler=register_io_handler(m->bound,READ_HANDLER, + new_connection,m); + listen(m->bound,5); + m->port=addr.sin_port; + return(1); +} + + +/* Function: shutdown_connections + * Arguments: m: the manager structure + * + * close all connections and reclaim resources + */ +void shutdown_connections(manager m) +{ + close(m->bound); + remove_io_handler(m->bound_handler); + hash_destroy_table(m->connections,remove_connection); + free(m); +} + + +/* Function: init_connections + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: a newly allocated manager structure, or + * zero if the fixed port could not be bound + */ +manager init_connections(unsigned short pid, + int (*input)(), + void *a) +{ + manager m=(manager)malloc(sizeof(struct manager)); + m->connections=hash_create_table(compare_connection,connection_key); + m->handler=input; + m->handler_arg=a; + if (bind_socket(m,pid)) return(m); + free(m); + return(0); +} diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h new file mode 100644 index 0000000..f6b2994 --- /dev/null +++ b/lnet/ulnds/socklnd/connection.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, + unsigned int ip, + unsigned int short); +manager init_connections(unsigned short, + int (*f)(void *,connection), + void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, + unsigned char *dest, + int len); diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lnet/ulnds/socklnd/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +int smp_processor_id = 1; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; +} + + +void portals_debug_print(void) +{ + return; +} + + +void portals_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + + +int portals_debug_init(unsigned long bufsize) +{ + debug_file_fd = stdout; + return 0; +} + +int portals_debug_cleanup(void) +{ + return 0; //close(portals_debug_fd); +} + +int portals_debug_clear_buffer(void) +{ + return 0; +} + +int portals_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, "*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +int portals_debug_copy_to_user(char *buf, unsigned long len) +{ + return 0; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + const char *format, ...) +{ + va_list ap; + unsigned long flags; + struct timeval tv; + int nob = 0; /* accumulated with += below, so it must start at zero */ + + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+ */ + + gettimeofday(&tv, NULL); + + nob += fprintf(debug_file_fd, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id, + tv.tv_sec, tv.tv_usec); + + nob += fprintf(debug_file_fd, + "(%s:%d:%s() %d+%ld): ", + file, line, fn, 0, + 8192 - ((unsigned long)&flags & 8191UL)); + + va_start (ap, format); + nob += vfprintf(debug_file_fd, format, ap); /* a va_list must be formatted with vfprintf, not fprintf */ + va_end (ap); + + +} + diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h new file mode 100644 index 0000000..34dd070 --- /dev/null +++ b/lnet/ulnds/socklnd/dispatch.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* this file is only called dispatch.h to prevent it + from colliding with /usr/include/sys/select.h */ + +typedef struct io_handler *io_handler; + +struct io_handler{ + io_handler *last; + io_handler next; + int fd; + int type; + int (*function)(void *); + void *argument; + int disabled; +}; + + +#define READ_HANDLER 1 +#define WRITE_HANDLER 2 +#define EXCEPTION_HANDLER 4 +#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) + +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg); + +void remove_io_handler (io_handler i); +void init_unix_timer(void); +void select_timer_block(when until); +when now(void); diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h new file mode 100644 index 0000000..85b1e18 --- /dev/null +++ b/lnet/ulnds/socklnd/ipmap.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#define DIRECT_IP_MODE +#ifdef DIRECT_IP_MODE +#define PNAL_NID(in_addr, port) (in_addr) +#define PNAL_PID(pid) (pid) +#define PNAL_IP(in_addr, port) (in_addr) +#define PNAL_PORT(nid, pid) (pid) +#else + +#define PNAL_BASE_PORT 4096 +#define PNAL_HOSTID_SHIFT 24 +#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) +#define PNAL_VNODE_SHIFT 8 +#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) +#define PNAL_PID_SHIFT 8 +#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) + +#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ + << PNAL_VNODE_SHIFT) \ + | (((ntohs(port)-PNAL_BASE_PORT) >>\ + PNAL_PID_SHIFT))) +#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) + +#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ + >> PNAL_VNODE_SHIFT)\ + | (t->iptop8 << PNAL_HOSTID_SHIFT))) +#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ + << PNAL_VNODE_SHIFT) \ + | ((pid) & PNAL_PID_MASK)) \ + + PNAL_BASE_PORT)) +#endif diff --git a/lnet/ulnds/socklnd/pqtimer.c b/lnet/ulnds/socklnd/pqtimer.c new file mode 100644 index 0000000..fa2fb4f --- /dev/null +++ b/lnet/ulnds/socklnd/pqtimer.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* timer.c: + * this file implements a simple priority-queue based timer system. when + * combined with a file which implements now() and block(), it can + * be used to provide course-grained time-based callbacks. + */ + +#include +#include +#include + +struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +}; + +typedef struct thunk *thunk; +struct thunk { + void (*f)(void *); + void *a; + thunk next; +}; + +extern when now(void); + +static thunk thunks; +static int internal; +static void (*block_function)(when); +static int number_of_timers; +static int size_of_pqueue; +static timer *timers; + + +static void heal(int where) +{ + int left=(where<<1); + int right=(where<<1)+1; + int min=where; + timer temp; + + if (left <= number_of_timers) + if (timers[left]->w < timers[min]->w) min=left; + if (right <= number_of_timers) + if (timers[right]->w < timers[min]->w) min=right; + if (min != where){ + temp=timers[where]; + timers[where]=timers[min]; + timers[min]=temp; + heal(min); + } +} + +static void add_pqueue(int i) +{ + timer temp; + int parent=(i>>1); + if ((i>1) && (timers[i]->w< timers[parent]->w)){ + temp=timers[i]; + timers[i]=timers[parent]; + timers[parent]=temp; + add_pqueue(parent); + } +} + +static void add_timer(timer t) +{ + if (size_of_pqueue<(number_of_timers+2)){ + int oldsize=size_of_pqueue; + timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10)); + memcpy(new,timers,sizeof(timer)*oldsize); + timers=new; + } + timers[++number_of_timers]=t; + add_pqueue(number_of_timers); +} + +/* Function: register_timer + * Arguments: interval: the time interval from the current time when + * the timer function should be called + * function: the function to call when the time has expired + * argument: 
the argument to call it with. + * Returns: a pointer to a timer structure + */ +timer register_timer(when interval, + void (*function)(void *), + void *argument) +{ + timer t=(timer)malloc(sizeof(struct timer)); + + t->arg=argument; + t->function=function; + t->interval=interval; + t->disable=0; + t->w=now()+interval; + add_timer(t); + if (!internal && (number_of_timers==1)) + block_function(t->w); + return(t); +} + +/* Function: remove_timer + * Arguments: t: + * Returns: nothing + * + * remove_timer removes a timer from the system, insuring + * that it will never be called. It does not actually + * free the timer due to reentrancy issues. + */ + +void remove_timer(timer t) +{ + t->disable=1; +} + + + +void timer_fire() +{ + timer current; + + current=timers[1]; + timers[1]=timers[number_of_timers--]; + heal(1); + if (!current->disable) { + (*current->function)(current->arg); + } + free(current); +} + +when next_timer(void) +{ + when here=now(); + + while (number_of_timers && (timers[1]->w <= here)) timer_fire(); + if (number_of_timers) return(timers[1]->w); + return(0); +} + +/* Function: timer_loop + * Arguments: none + * Returns: never + * + * timer_loop() is the blocking dispatch function for the timer. + * Is calls the block() function registered with init_timer, + * and handles associated with timers that have been registered. + */ +void timer_loop() +{ + when here; + + while (1){ + thunk z; + here=now(); + + for (z=thunks;z;z=z->next) (*z->f)(z->a); + + if (number_of_timers){ + if (timers[1]->w > here){ + (*block_function)(timers[1]->w); + } else { + timer_fire(); + } + } else { + thunk z; + for (z=thunks;z;z=z->next) (*z->f)(z->a); + (*block_function)(0); + } + } +} + + +/* Function: register_thunk + * Arguments: f: the function to call + * a: the single argument to call it with + * + * Thunk functions get called at irregular intervals, they + * should not assume when, or take a particularily long + * amount of time. 
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lnet/ulnds/socklnd/pqtimer.h b/lnet/ulnds/socklnd/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lnet/ulnds/socklnd/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lnet/ulnds/socklnd/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Function: forward + * Arguments: nal_t *nal: pointer to my top-side nal structure + * id: the command to pass to the lower layer + * args, args_len:pointer to and length of the request + * ret, ret_len: pointer to and size of the result + * Returns: a portals status code + * + * forwards a packaged api call from the 'api' side to the 'library' + * side, and collects the result + */ +#define forward_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(PTL_SEGV);\ + } +static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len, + void *ret, ptl_size_t ret_len) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int lib=p->to_lib[1]; + int k; + + forward_failure(write,lib, &id, sizeof(id)); + forward_failure(write,lib,&args_len, sizeof(args_len)); + forward_failure(write,lib,&ret_len, sizeof(ret_len)); + forward_failure(write,lib,args, args_len); + + do { + k=syscall(SYS_read, p->from_lib[0], ret, ret_len); + } while ((k!=ret_len) && (errno == EINTR)); /* retry only when a signal interrupted the read; any other failure falls through to the error path */ + + if(k!=ret_len){ + perror("nal: read return block"); + return PTL_SEGV; + } + return (PTL_OK); +} +#undef forward_failure + + +/* Function: shutdown + * Arguments: nal: a pointer to my top side nal structure + * ni: my network interface index + * + * cleanup nal state, reclaim the lower side thread and + * its state using PTL_FINI codepoint + */ +static int procbridge_shutdown(nal_t *n, int ni) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int code=PTL_FINI; + + syscall(SYS_write, p->to_lib[1],&code,sizeof(code)); + syscall(SYS_read, p->from_lib[0],&code,sizeof(code)); + + syscall(SYS_close, p->to_lib[0]); + syscall(SYS_close, p->to_lib[1]); + syscall(SYS_close, p->from_lib[0]); + syscall(SYS_close, p->from_lib[1]); + + free(p); + return(0); +} + + +/* Function: validate + * 
useless stub + */ +static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent) +{ + return(0); +} + + +/* Function: yield + * Arguments: pid: + * + * this function was originally intended to allow the + * lower half thread to be scheduled to allow progress. we + * overload it to explicitly block until signalled by the + * lower half. + */ +static void procbridge_yield(nal_t *n) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_wait(&p->cond,&p->mutex); + pthread_mutex_unlock(&p->mutex); +} + + +static void procbridge_lock(nal_t * nal, unsigned long *flags){} +static void procbridge_unlock(nal_t * nal, unsigned long *flags){} +/* api_nal + * the interface vector to allow the generic code to access + * this nal. this is seperate from the library side nal_cb. + * TODO: should be dyanmically allocated + */ +static nal_t api_nal = { + ni: {0}, + nal_data: NULL, + forward: procbridge_forward, + shutdown: procbridge_shutdown, + validate: procbridge_validate, + yield: procbridge_yield, + lock: procbridge_lock, + unlock: procbridge_unlock +}; + +/* Function: bridge_init + * + * Arguments: pid: requested process id (port offset) + * PTL_ID_ANY not supported. + * desired: limits passed from the application + * and effectively ignored + * actual: limits actually allocated and returned + * + * Returns: a pointer to my statically allocated top side NAL + * structure + * + * initializes the tcp nal. we define unix_failure as an + * error wrapper to cut down clutter. 
+ */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + 
perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lnet/ulnds/socklnd/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include +#include +#include + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lnet/ulnds/socklnd/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. + * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 
'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */ + +#define data_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(0);\ + } +static int data_from_api(void *arg) +{ + bridge b = arg; + procbridge p=(procbridge)b->local; + /* where are these two sizes derived from ??*/ + char arg_block[ 256 ]; + char ret_block[ 128 ]; + ptl_size_t arg_len,ret_len; + int fd=p->to_lib[0]; + int index; + + data_failure(read,fd, &index, sizeof(index)); + + if (index==PTL_FINI) { + lib_fini(b->nal_cb); + if (b->shutdown) (*b->shutdown)(b); + syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive)); + + /* a heavy-handed but convenient way of shutting down + the lower side thread */ + pthread_exit(0); + } + + data_failure(read,fd, &arg_len, sizeof(arg_len)); + data_failure(read,fd, &ret_len, sizeof(ret_len)); + data_failure(read,fd, arg_block, arg_len); + + lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block); + + data_failure(write,p->from_lib[1],ret_block, ret_len); + return(1); +} +#undef data_failure + + + +static void wakeup_topside(void *z) +{ + bridge b=z; + procbridge p=b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); +} + + +/* Function: nal_thread + * Arguments: z: an opaque reference to a nal control structure + * allocated and partially populated by the api level code + * Returns: nothing, and only on error or explicit shutdown + * + * This function is the entry point of the pthread initiated on + * the api side of the interface. This thread is used to handle + * asynchronous delivery to the application. 
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type)))) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); + /* initialize the generic 'library' level code */ + + rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lnet/ulnds/socklnd/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */ +void remove_io_handler (io_handler i) +{ + i->disabled=1; +} + +static void set_flag(io_handler n,fd_set *fds) +{ + if (n->type & READ_HANDLER) FD_SET(n->fd,fds); + if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2); +} + + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int result; + io_handler j; + io_handler *k; + + /* TODO: loop until the entire interval is expired*/ + if (until){ + when interval=until-now(); + timeout.tv_sec=(interval>>32); + timeout.tv_usec=((interval<<32)/1000000)>>32; + timeout_pointer=&timeout; + } else timeout_pointer=0; + + FD_ZERO(fds); + FD_ZERO(fds+1); + FD_ZERO(fds+2); + for (k=&io_handlers;*k;){ + if ((*k)->disabled){ + j=*k; + *k=(*k)->next; + free(j); + } + if (*k) { + set_flag(*k,fds); + k=&(*k)->next; + } + } + result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer); + + if (result > 0) + for (j=io_handlers;j;j=j->next){ + if (!(j->disabled) && + ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) || + (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) || + (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){ + if (!(*j->function)(j->argument)) + j->disabled=1; + } + } +} + +/* Function: init_unix_timer() + * is called to initialize the library + */ +void init_unix_timer() +{ + io_handlers=0; + gettimeofday(&beginning_of_epoch, 0); + initialize_timer(select_timer_block); +} diff --git a/lnet/ulnds/socklnd/table.c b/lnet/ulnds/socklnd/table.c new file mode 100644 index 0000000..bef13c5 --- /dev/null +++ b/lnet/ulnds/socklnd/table.c @@ -0,0 +1,264 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + 
* + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + + +/* table.c: + * a very simple hash table implementation with paramerterizable + * comparison and key generation functions. it does resize + * in order to accomidate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;inext; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value 
to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + * hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + new->compare_function=compare_function; + new->key_function=key_function; + new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + 
t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? */ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;isize;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lnet/ulnds/socklnd/table.h b/lnet/ulnds/socklnd/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lnet/ulnds/socklnd/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c new file mode 100644 index 0000000..8bf55c4 --- /dev/null +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -0,0 +1,196 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signally behaviour, but its + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *nal: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * user_ptr data: pointer to the destination buffer + * size_t mlen: length of the body + * size_t rlen: length of data in the network + * Returns: zero on success + * + * blocking read of the requested data. must drain out the + * difference of mainpulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a,connection c) +{ + bridge b=a; + ptl_hdr_t hdr; + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: PTL_IFACE_TCP + * Arguments: pid_request: desired port number to bind to + * desired: passed NAL limits structure + * actual: returned NAL limits structure + * Returns: a nal structure on success, or null on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lnet/ulnds/socklnd/timer.h b/lnet/ulnds/socklnd/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lnet/ulnds/socklnd/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lnet/ulnds/socklnd/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c new file mode 100644 index 0000000..bef13c5 --- /dev/null +++ b/lnet/ulnds/table.c @@ -0,0 +1,264 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + + +/* table.c: + * a very simple hash table implementation with paramerterizable + * comparison and key generation functions. it does resize + * in order to accomidate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;inext; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + 
* hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + new->compare_function=compare_function; + new->key_function=key_function; + new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int 
success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? */ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;isize;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lnet/ulnds/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c new file mode 100644 index 0000000..8bf55c4 --- /dev/null +++ b/lnet/ulnds/tcplnd.c @@ -0,0 +1,196 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signalling behaviour, but it's + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *n: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * struct iovec *iov: destination buffer (niov must be <= 1) + * ptl_size_t mlen: length of the body + * ptl_size_t rlen: length of data in the network + * Returns: rlen + * + * blocking read of the requested data. must drain out the + * difference of manipulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a,connection c) +{ + bridge b=a; + ptl_hdr_t hdr; + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: tcpnal_init + * Arguments: b: the bridge to attach the TCP NAL to; its nal_cb + * send/recv callbacks and shutdown hook are set here + * and b->lower receives the connection manager + * Returns: PTL_OK on success, or PTL_NAL_FAILED on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lnet/ulnds/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lnet/ulnds/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore new file mode 100644 index 0000000..041cd6b --- /dev/null +++ b/lnet/utils/.cvsignore @@ -0,0 +1,7 @@ +Makefile +Makefile.in +acceptor +debugctl +ptlctl +.deps +routerstat diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am new file mode 100644 index 0000000..065fcf9 --- /dev/null +++ b/lnet/utils/Makefile.am @@ -0,0 +1,25 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + + +COMPILE = gcc -Wall -g -I$(srcdir)/../include +LINK = gcc -o $@ + +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat +lib_LIBRARIES = libptlctl.a + +acceptor_SOURCES = acceptor.c # -lefence + +libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h + +ptlctl_SOURCES = ptlctl.c +ptlctl_LDADD = -L. 
-lptlctl -lncurses # -lefence +ptlctl_DEPENDENCIES = libptlctl.a + +debugctl_SOURCES = debugctl.c +debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_DEPENDENCIES = libptlctl.a + +routerstat_SOURCES = routerstat.c diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c new file mode 100644 index 0000000..c6590db --- /dev/null +++ b/lnet/utils/acceptor.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +/* should get this from autoconf somehow */ +#ifndef PIDFILE_DIR +#define PIDFILE_DIR "/var/run" +#endif + +#define PROGNAME "acceptor" + +void create_pidfile(char *name, int port) +{ + char pidfile[1024]; + FILE *fp; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if ((fp = fopen(pidfile, "w"))) { + fprintf(fp, "%d\n", getpid()); + fclose(fp); + } else { + syslog(LOG_ERR, "%s: %s\n", pidfile, + strerror(errno)); + } +} + +int pidfile_exists(char *name, int port) +{ + char pidfile[1024]; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if (!access(pidfile, F_OK)) { + fprintf(stderr, "%s: exists, acceptor already running.\n", + pidfile); + return (1); + } + return (0); +} + +int +parse_size (int *sizep, char *str) +{ + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) + { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +void +show_connection (int fd, __u32 net_ip, ptl_nid_t nid) +{ + struct hostent *h = gethostbyaddr 
((char *)&net_ip, sizeof net_ip, AF_INET); + __u32 host_ip = ntohl (net_ip); + int rxmem = 0; + int txmem = 0; + int nonagle = 0; + int len; + char host[1024]; + + len = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0) + perror ("Cannot get write buffer size"); + + len = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0) + perror ("Cannot get read buffer size"); + + len = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0) + perror ("Cannot get nagle"); + + if (h == NULL) + snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, + (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); + else + snprintf (host, sizeof(host), "%s", h->h_name); + + syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", + host, nid, txmem, rxmem, nonagle ? "disabled" : "enabled"); +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = 
__cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. */ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... 
*/ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +void +usage (char *myname) +{ + fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname); + exit (1); +} + +int main(int argc, char **argv) +{ + int o, fd, rc, port, pfd; + struct sockaddr_in srvaddr; + int c; + int rxmem = 0; + int txmem = 0; + int noclose = 0; + int nonagle = 1; + int nal = SOCKNAL; + int xchg_nids = 0; + int bind_irq = 0; + + while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1) + switch (c) + { + case 'r': + if (parse_size (&rxmem, optarg) != 0 || rxmem < 0) + usage (argv[0]); + break; + + case 's': + if (parse_size (&txmem, optarg) != 0 || txmem < 0) + usage (argv[0]); + break; + + case 'n': + nonagle = 0; + break; + + case 'l': + noclose = 1; + break; + + case 'x': + xchg_nids = 1; + break; + + case 'i': + bind_irq = 1; + break; + + case 'N': + if (parse_size(&nal, optarg) != 0 || + nal < 0 || nal > NAL_MAX_NR) + usage(argv[0]); + break; + + default: + usage (argv[0]); + break; + } + + if (optind >= argc) + usage (argv[0]); + + port = atol(argv[optind++]); + + if (pidfile_exists(PROGNAME, port)) + exit(1); + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(port); + srvaddr.sin_addr.s_addr = INADDR_ANY; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("opening socket"); + exit(1); + } + + o = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { + 
perror("Cannot set REUSEADDR socket opt"); + exit(1); + } + + if (nonagle) + { + o = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); + if (rc != 0) + { + perror ("Cannot disable nagle"); + exit (1); + } + } + + if (txmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem)); + if (rc != 0) + { + perror ("Cannot set write buffer size"); + exit (1); + } + } + + if (rxmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem)); + if (rc != 0) + { + perror ("Cannot set read buffer size"); + exit (1); + } + } + + rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + perror("bind: "); + exit(1); + } + + if (listen(fd, 127)) { + perror("listen: "); + exit(1); + } + fprintf(stderr, "listening on port %d\n", port); + + pfd = open("/dev/portals", O_RDWR); + if ( pfd < 0 ) { + perror("opening portals device"); + exit(1); + } + + rc = daemon(1, noclose); + if (rc < 0) { + perror("daemon(): "); + exit(1); + } + + openlog(PROGNAME, LOG_PID, LOG_DAEMON); + syslog(LOG_INFO, "started, listening on port %d\n", port); + create_pidfile(PROGNAME, port); + + while (1) { + struct sockaddr_in clntaddr; + int len = sizeof(clntaddr); + int cfd; + struct portal_ioctl_data data; + ptl_nid_t peer_nid; + + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); + if ( cfd < 0 ) { + perror("accept"); + exit(0); + continue; + } + + if (!xchg_nids) + peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */ + else + { + PORTAL_IOC_INIT (data); + data.ioc_nal = nal; + rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + { + perror ("Can't get my NID"); + close (cfd); + continue; + } + + rc = exchange_nids (cfd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (cfd); + continue; + } + } + + show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid); + + PORTAL_IOC_INIT(data); + data.ioc_fd = cfd; + data.ioc_nal = nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = 
peer_nid; + data.ioc_flags = bind_irq; + + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { + perror("ioctl failed"); + + } else { + printf("client registered\n"); + } + rc = close(cfd); + if (rc) + perror ("close failed"); + } + + closelog(); + exit(0); + +} diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c new file mode 100644 index 0000000..13572dc --- /dev/null +++ b/lnet/utils/debug.c @@ -0,0 +1,620 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define BUG() /* workaround for module.h includes */ +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +#endif + +#include +#include +#include "parser.h" + +static char rawbuf[8192]; +static char *buf = rawbuf; +static int max = 8192; +//static int g_pfd = -1; +static int subsystem_array[1 << 8]; +static int debug_mask = ~0; + +static const char *portal_debug_subsystems[] = + {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite", + "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter", + "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL}; +static const char *portal_debug_masks[] = + {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", + "blocks", "net", "warning", "buffs", "other", "dentry", "portals", + "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL}; + +struct debug_daemon_cmd { + char *cmd; + unsigned int cmdv; +}; + +static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { + {"start", DEBUG_DAEMON_START}, + {"stop", DEBUG_DAEMON_STOP}, + {"pause", DEBUG_DAEMON_PAUSE}, + {"continue", DEBUG_DAEMON_CONTINUE}, + {0, 0} +}; + +static int do_debug_mask(char *name, int enable) +{ + int found = 0, i; + + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || + strcasecmp(name, "all_subs") == 0) { + printf("%s output from subsystem \"%s\"\n", + enable ? "Enabling" : "Disabling", + portal_debug_subsystems[i]); + subsystem_array[i] = enable; + found = 1; + } + } + for (i = 0; portal_debug_masks[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_masks[i]) == 0 || + strcasecmp(name, "all_types") == 0) { + printf("%s output of type \"%s\"\n", + enable ? 
"Enabling" : "Disabling", + portal_debug_masks[i]); + if (enable) + debug_mask |= (1 << i); + else + debug_mask &= ~(1 << i); + found = 1; + } + } + + return found; +} + +int dbg_initialize(int argc, char **argv) +{ + memset(subsystem_array, 1, sizeof(subsystem_array)); + return 0; +} + +int jt_dbg_filter(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 0)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + return 0; +} + +int jt_dbg_show(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 1)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + + return 0; +} + +static int applymask(char* procpath, int value) +{ + int rc; + char buf[64]; + int len = snprintf(buf, 64, "%d", value); + + int fd = open(procpath, O_WRONLY); + if (fd == -1) { + fprintf(stderr, "Unable to open %s: %s\n", + procpath, strerror(errno)); + return fd; + } + rc = write(fd, buf, len+1); + if (rc<0) { + fprintf(stderr, "Write to %s failed: %s\n", + procpath, strerror(errno)); + return rc; + } + close(fd); + return 0; +} + +extern char *dump_filename; +extern int dump(int dev_id, int opc, void *buf); + +static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) +{ + if (!dump_filename) { + applymask("/proc/sys/portals/subsystem_debug", subs_mask); + applymask("/proc/sys/portals/debug", debug_mask); + } else { + struct portals_debug_ioctl_data data; + + data.hdr.ioc_len = sizeof(data); + data.hdr.ioc_version = 0; + data.subs = subs_mask; + data.debug = debug_mask; + + dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); + } + printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", + subs_mask, debug_mask); +} + +int jt_dbg_list(int argc, char **argv) +{ + int i; + + if (argc != 2) { + 
fprintf(stderr, "usage: %s \n", argv[0]); + return 0; + } + + if (strcasecmp(argv[1], "subs") == 0) { + printf("Subsystems: all_subs"); + for (i = 0; portal_debug_subsystems[i] != NULL; i++) + printf(", %s", portal_debug_subsystems[i]); + printf("\n"); + } else if (strcasecmp(argv[1], "types") == 0) { + printf("Types: all_types"); + for (i = 0; portal_debug_masks[i] != NULL; i++) + printf(", %s", portal_debug_masks[i]); + printf("\n"); + } + else if (strcasecmp(argv[1], "applymasks") == 0) { + unsigned int subsystem_mask = 0; + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (subsystem_array[i]) subsystem_mask |= (1 << i); + } + applymask_all(subsystem_mask, debug_mask); + } + return 0; +} + +/* if 'raw' is true, don't strip the debug information from the front of the + * lines */ +static void dump_buffer(FILE *fd, char *buf, int size, int raw) +{ + char *p, *z; + unsigned long subsystem, debug, dropped = 0, kept = 0; + int max_sub, max_type; + + for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++) + ; + for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++) + ; + + while (size) { + p = memchr(buf, '\n', size); + if (!p) + break; + subsystem = strtoul(buf, &z, 16); + debug = strtoul(z + 1, &z, 16); + + z++; + /* for some reason %*s isn't working. 
*/ + *p = '\0'; + if (subsystem < max_sub && + subsystem_array[subsystem] && + (!debug || (debug_mask & debug))) { + if (raw) + fprintf(fd, "%s\n", buf); + else + fprintf(fd, "%s\n", z); + //printf("%s\n", buf); + kept++; + } else { + //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf); + dropped++; + } + *p = '\n'; + p++; + size -= (p - buf); + buf = p; + } + + printf("Debug log: %lu lines, %lu kept, %lu dropped.\n", + dropped + kept, kept, dropped); +} + +int jt_dbg_debug_kernel(int argc, char **argv) +{ + int rc, raw = 1; + FILE *fd = stdout; + const int databuf_size = (6 << 20); + struct portal_ioctl_data data, *newdata; + char *databuf = NULL; + + if (argc > 3) { + fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); + return 0; + } + + if (argc > 1) { + fd = fopen(argv[1], "w"); + if (fd == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } + } + if (argc > 2) + raw = atoi(argv[2]); + + databuf = malloc(databuf_size); + if (!databuf) { + fprintf(stderr, "No memory for buffer.\n"); + goto out; + } + + memset(&data, 0, sizeof(data)); + data.ioc_plen1 = databuf_size; + data.ioc_pbuf1 = databuf; + + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + goto out; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n", + strerror(errno)); + goto out; + } + + newdata = (struct portal_ioctl_data *)buf; + if (newdata->ioc_size > 0) + dump_buffer(fd, databuf, newdata->ioc_size, raw); + else + fprintf(stderr, "No data in the debug buffer.\n"); + + out: + if (databuf) + free(databuf); + if (fd != stdout) + fclose(fd); + return 0; +} + +int jt_dbg_debug_daemon(int argc, char **argv) +{ + int i, rc; + unsigned int cmd = 0; + FILE *fd = stdout; + struct portal_ioctl_data data; + + if (argc <= 1) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + 
return 0; + } + for (i = 0; portal_debug_daemon_cmd[i].cmd != NULL; i++) { + if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) { + cmd = portal_debug_daemon_cmd[i].cmdv; + break; + } + } + if (portal_debug_daemon_cmd[i].cmd == NULL) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + return 0; + } + memset(&data, 0, sizeof(data)); + if (cmd == DEBUG_DAEMON_START) { + if (argc < 3) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|" + "pause|continue]\n", argv[0]); + return 0; + } + if (access(argv[2], F_OK) != 0) { + fd = fopen(argv[2], "w"); + if (fd != NULL) { + fclose(fd); + remove(argv[2]); + goto ok; + } + } + if (access(argv[2], W_OK) == 0) + goto ok; + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + return -1; +ok: + data.ioc_inllen1 = strlen(argv[2]) + 1; + data.ioc_inlbuf1 = argv[2]; + data.ioc_misc = 0; + if (argc == 4) { + unsigned long size; + errno = 0; + size = strtoul(argv[3], NULL, 0); + if (errno) { + fprintf(stderr, "file size(%s): error %s\n", + argv[3], strerror(errno)); + return -1; + } + data.ioc_misc = size; + } + } + data.ioc_count = cmd; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf); + if (rc < 0) { + fprintf(stderr, "IOC_PORTAL_SET_DEMON failed: %s\n", + strerror(errno)); + return rc; + } + return 0; +} + +int jt_dbg_debug_file(int argc, char **argv) +{ + int rc, fd = -1, raw = 1; + FILE *output = stdout; + char *databuf = NULL; + struct stat statbuf; + + if (argc > 4 || argc < 2) { + fprintf(stderr, "usage: %s [output] [raw]\n", argv[0]); + return 0; + } + + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 
+#endif + rc = syscall(__SYS_fstat__, fd, &statbuf); + if (rc < 0) { + fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + goto out; + } + + if (argc >= 3) { + output = fopen(argv[2], "w"); + if (output == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + goto out; + } + } + + if (argc == 4) + raw = atoi(argv[3]); + + databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (databuf == NULL) { + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + goto out; + } + + dump_buffer(output, databuf, statbuf.st_size, raw); + + out: + if (databuf) + munmap(databuf, statbuf.st_size); + if (output != stdout) + fclose(output); + if (fd > 0) + close(fd); + return 0; +} + +int jt_dbg_clear_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_dbg_mark_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + char *text; + time_t now = time(NULL); + + if (argc > 2) { + fprintf(stderr, "usage: %s [marker text]\n", argv[0]); + return 0; + } + + if (argc == 2) { + text = argv[1]; + } else { + text = ctime(&now); + text[strlen(text) - 1] = '\0'; /* stupid \n */ + } + + memset(&data, 0, sizeof(data)); + data.ioc_inllen1 = strlen(text) + 1; + data.ioc_inlbuf1 = text; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n", + strerror(errno)); + return -1; + 
} + return 0; +} + + +int jt_dbg_modules(int argc, char **argv) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct mod_paths { + char *name, *path; + } *mp, mod_paths[] = { + {"portals", "portals/linux/oslib"}, + {"ksocknal", "portals/linux/socknal"}, + {"obdclass", "lustre/obdclass"}, + {"ptlrpc", "lustre/ptlrpc"}, + {"obdext2", "lustre/obdext2"}, + {"ost", "lustre/ost"}, + {"osc", "lustre/osc"}, + {"mds", "lustre/mds"}, + {"mdc", "lustre/mdc"}, + {"llite", "lustre/llite"}, + {"obdecho", "lustre/obdecho"}, + {"ldlm", "lustre/ldlm"}, + {"obdfilter", "lustre/obdfilter"}, + {"extN", "lustre/extN"}, + {"lov", "lustre/lov"}, + {"fsfilt_ext3", "lustre/obdclass"}, + {"fsfilt_extN", "lustre/obdclass"}, + {"mds_ext2", "lustre/mds"}, + {"mds_ext3", "lustre/mds"}, + {"mds_extN", "lustre/mds"}, + {"ptlbd", "lustre/ptlbd"}, + {NULL, NULL} + }; + char *path = ".."; + char *kernel = "linux"; + + if (argc >= 2) + path = argv[1]; + if (argc == 3) + kernel = argv[2]; + if (argc > 3) { + printf("%s [path] [kernel]\n", argv[0]); + return 0; + } + + for (mp = mod_paths; mp->name != NULL; mp++) { + struct module_info info; + int rc; + size_t crap; + int query_module(const char *name, int which, void *buf, + size_t bufsize, size_t *ret); + + rc = query_module(mp->name, QM_INFO, &info, sizeof(info), + &crap); + if (rc < 0) { + if (errno != ENOENT) + printf("query_module(%s) failed: %s\n", + mp->name, strerror(errno)); + } else { + printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, + mp->path, mp->name, + info.addr + sizeof(struct module)); + } + } + + return 0; +#else + printf("jt_dbg_module is not yet implemented for Linux 2.5\n"); + return 0; +#endif /* linux 2.5 */ +} + +int jt_dbg_panic(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + 
return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} diff --git a/lnet/utils/debugctl.c b/lnet/utils/debugctl.c new file mode 100644 index 0000000..02cb9b4 --- /dev/null +++ b/lnet/utils/debugctl.c @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. 
+ * + */ + +#include +#include +#include +#include +#include "parser.h" + + +command_t list[] = { + {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"}, + {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, + {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file [output] [raw], read debug buffer from input and print it [to output]"}, + {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"}, + {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"}, + {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"}, + {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"}, + {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"}, + {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: )"}, + {"panic", jt_dbg_panic, 0, "cause the kernel to panic"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (dbg_initialize(argc, argv) < 0) + exit(2); + + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + + Parser_init("debugctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + unregister_ioc_dev(PORTALS_DEV_ID); + return 0; +} diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c new file mode 100644 index 0000000..722bb57 --- /dev/null +++ b/lnet/utils/l_ioctl.c @@ -0,0 +1,281 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char * dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +struct dump_hdr { + int magic; + int dev_id; + int opc; +}; + +char * dump_filename; + +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +static int +do_ioctl(int dev_id, int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + return rc; + +} + +static FILE * +get_dump_file() +{ + FILE *fp = NULL; + + if (!dump_filename) { + fprintf(stderr, "no dump filename\n"); + } else + fp = fopen(dump_filename, "a"); 
+ return fp; +} + +/* + * The dump file should start with a description of which devices are + * used, but for now it will assumed whatever app reads the file will + * know what to do. */ +int +dump(int dev_id, int opc, void *buf) +{ + FILE *fp; + struct dump_hdr dump_hdr; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + int rc; + + printf("dumping opc %x to %s\n", opc, dump_filename); + + + dump_hdr.magic = 0xdeadbeef; + dump_hdr.dev_id = dev_id; + dump_hdr.opc = opc; + + fp = get_dump_file(); + if (fp == NULL) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char * dev_name) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return; + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +/* If this file is set, then all ioctl buffers will be + appended to the file. */ +int +set_ioctl_dump(char * file) +{ + if (dump_filename) + free(dump_filename); + + dump_filename = strdup(file); + return 0; +} + +int +l_ioctl(int dev_id, int opc, void *buf) +{ + if (dump_filename) + return dump(dev_id, opc, buf); + else + return do_ioctl(dev_id, opc, buf); +} + +/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer + * in the file. 
For example: + * + * parse_dump("lctl.dump", l_ioctl); + * + * Note: if using l_ioctl, then you also need to register_ioc_dev() for + * each device used in the dump. + */ +int +parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) +{ + int fd, line =0; + struct stat st; + char *buf, *end; + + fd = syscall(SYS_open, dump_file, O_RDONLY); + +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + if (syscall(__SYS_fstat__, fd, &st)) { + perror("stat fails"); + exit(1); + } + + if (st.st_size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } + + buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = buf + st.st_size; + close(fd); + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + char tmp[8096]; + int rc; + + line++; + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + if (buf + data->ioc_len > end ) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } +#if 0 + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); +#endif + + memcpy(tmp, data, data->ioc_len); + + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } + + buf += data->ioc_len + sizeof(*dump_hdr); + } + return 0; +} + +int +jt_ioc_dump(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + printf("setting dumpfile to: %s\n", argv[1]); + + set_ioctl_dump(argv[1]); + return 0; +} diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c new file mode 100644 index 0000000..4d93645 --- /dev/null +++ b/lnet/utils/parser.c @@ -0,0 +1,703 @@ +/* -*- mode: c; 
c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef HAVE_LIBREADLINE +#define READLINE_LIBRARY +#include +#endif +//extern char **completion_matches __P((char *, rl_compentry_func_t *)); +extern void using_history(void); +extern void stifle_history(int); +extern void add_history(char *); + +#include "parser.h" + +static command_t * top_level; /* Top level of commands, initialized by + * InitParser */ +static char * parser_prompt = NULL;/* Parser prompt, set by InitParser */ +static int done; /* Set to 1 if user types exit or quit */ + + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); +static void print_commands(char *str, command_t *table); + +static char * skipwhitespace(char * s) +{ + char * t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++); + return(t); +} + + +static char * skiptowhitespace(char * s) +{ + char * t; + 
+ for (t = s; *t && !isspace(*t); t++); + return(t); +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if ( arg ) { + argv[i] = arg; + i++; + } else + return 0; + + while( (arg = strtok(NULL, " \t")) && (i <= maxargs)) { + argv[i] = arg; + i++; + } + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if ( cmd ) { + return (cmd->pc_func)(argc, argv); + } else { + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\" ", cmd->pc_name); + printf("\nas argument.\n"); + } + return -1; +} + +/* returns the command_t * (NULL if not found) corresponding to a + _partial_ match with the first token in name. It sets *next to + point to the following token. Does not modify *name. */ +static command_t * find_cmd(char * name, command_t cmds[], char ** next) +{ + int i, len; + + if (!cmds || !name ) + return NULL; + + /* This sets name to point to the first non-white space character, + and next to the first whitespace after name, len to the length: do + this with strtok*/ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = *next - name; + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return(&cmds[i]); + } + } + return NULL; +} + +/* Recursively process a command line string s and find the command + corresponding to it. This can be ambiguous, full, incomplete, + non-existent. 
*/
+/* On return *result is the matched entry (NULL for CMD_NONE), *prev is the
+   string the last lookup was made on and *next points past the matched
+   token.  Returns one of:
+     CMD_NONE        no command matches the first token
+     CMD_AMBIG       the token is a prefix of more than one command
+     CMD_COMPLETE    the matched command has a handler (pc_func)
+     CMD_INCOMPLETE  the matched command only has sub-commands and the
+                     line ends before one is named */
+static int process(char *s, char ** next, command_t *lookup,
+                   command_t **result, char **prev)
+{
+        *result = find_cmd(s, lookup, next);
+        *prev = s;
+
+        /* non existent */
+        if ( ! *result )
+                return CMD_NONE;
+
+        /* found entry: is it ambigous, i.e. not exact command name and
+           more than one command in the list matches.  Note that find_cmd
+           points to the first ambiguous entry */
+        if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) &&
+             find_cmd(s, (*result) + 1, next))
+                return CMD_AMBIG;
+
+        /* found a unique command: component or full? */
+        if ( (*result)->pc_func )
+                return CMD_COMPLETE;
+
+        /* A command without a handler needs a sub-command.  The test must
+           look at the character *next points to: *next itself is never
+           NULL, so comparing the pointer with '\0' could never report an
+           incomplete command.  Whitespace is skipped first because *next
+           may still point at the blank that follows the token. */
+        if ( (*result)->pc_sub_cmd != NULL && *skipwhitespace(*next) == '\0' )
+                return CMD_INCOMPLETE;
+
+        return process(*next, next, (*result)->pc_sub_cmd, result, prev);
+}
+
+#ifdef HAVE_LIBREADLINE
+static command_t * match_tbl;   /* Command completion against this table */
+static char * command_generator(const char * text, int state)
+{
+        static int index,
+                   len;
+        char       *name;
+
+        /* Do we have a match table?
*/ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(char * text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; + table = find_cmd(pos, match_tbl, &pos)) { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return(completion_matches(text, command_generator)); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch( process(line, &next, top_level, &cmd, &prev) ) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, cmd, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +int +noop_fn () +{ + return (0); +} + +/* just in case you're ever in an airplane and discover you + forgot to install readline-dev. :) */ +int init_input() +{ + int interactive = isatty (fileno (stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) + { + rl_prep_term_function = (rl_vintfunc_t *)noop_fn; + rl_deprep_term_function = (rl_voidfunc_t *)noop_fn; + } + + rl_attempted_completion_function = (CPPFunction *)command_completion; + rl_completion_entry_function = (void *)command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + char line[2048]; + int n = 0; + if (prompt) + printf ("%s", prompt); + if (fgets(line, sizeof(line), stdin) == NULL) + return (NULL); + n = strlen(line); + if (n && line[n-1] == '\n') + line[n-1] = '\0'; + return strdup(line); +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? 
parser_prompt : NULL); + + if (!line) break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + + free(line); + } + return rc; +} + + +/* sets the parser prompt */ +void Parser_init(char * prompt, command_t * cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') + ret = sscanf(s, "%d", val); + else if (*(s+1) != 'x') + ret = sscanf(s, "%o", val); + else { + s++; + ret = sscanf(++s, "%x", val); + } + + return(ret); +} + + +void Parser_qhelp(int argc, char *argv[]) { + + printf("Available commands are:\n"); + + print_commands(NULL, top_level); + printf("For more help type: help command-name\n"); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if ( argc == 1 ) { + Parser_qhelp(argc, argv); + return 0; + } + + line[0]='\0'; + for ( i = 1 ; i < argc ; i++ ) { + strcat(line, argv[i]); + } + + switch ( process(line, &next, top_level, &result, &prev) ) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n",line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, result, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + Parser_help(2, argv); +} + +/************************************************************************* + * COMMANDS * + *************************************************************************/ + + +static void print_commands(char * str, command_t * table) { + command_t * cmds; + char buf[80]; + + for (cmds = table; cmds->pc_name; cmds++) { + if (cmds->pc_func) { + if (str) printf("\t%s %s\n", str, cmds->pc_name); + else printf("\t%s\n", cmds->pc_name); + } + if (cmds->pc_sub_cmd) { + if (str) { + sprintf(buf, "%s %s", str, cmds->pc_name); + print_commands(buf, cmds->pc_sub_cmd); + } else { + print_commands(cmds->pc_name, cmds->pc_sub_cmd); + } + } + } +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + theprompt = malloc(size); + assert(theprompt); + + sprintf(theprompt, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + if ( line == NULL || *line == '\0' ) { + strncpy(res, deft, len); + } else { + strncpy(res, line, len); + } + + if ( line ) { + free(line); + return res; + } else { + return NULL; + } +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + 
assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. 
Must start with yY or nN\n"); + fflush(stdout); + } while ( 1 ); + + if ( line ) + free(line); + if ( theprompt ) + free(theprompt); + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + + if ( rc == 0 ) { + return result; + } else { + return Parser_getint(prompt, deft, min, max, base); + } +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if ( inp == NULL || *inp == '\0' ) { + return Parser_getstr(prompt, deft, answer, len); + } else + return inp; +} + +/* change a string into a number: return 0 on success. No invalid characters + allowed. The processing of base and validity follows strtol(3)*/ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ( (base !=0) && (base < 2 || base > 36) ) + return 1; + + *result = strtol(inp, &endptr, base); + + if ( *inp != '\0' && *endptr == '\0' ) + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size (int *sizep, char *str) { + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp 
(str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} diff --git a/lnet/utils/parser.h b/lnet/utils/parser.h new file mode 100644 index 0000000..dead9f5 --- /dev/null +++ b/lnet/utils/parser.h @@ -0,0 +1,73 @@ +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 100 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + +/* Extracts an integer from a 
string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(int *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c new file mode 100644 index 0000000..8235271 --- /dev/null +++ b/lnet/utils/portals.c @@ -0,0 +1,1005 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "parser.h" + +unsigned int portal_debug; +unsigned int portal_printk; +unsigned int portal_stack; + + +static ptl_nid_t g_nid = 0; +static unsigned int g_nal = 0; +static unsigned short g_port = 0; + +static int g_socket_txmem = 0; +static int g_socket_rxmem = 0; +static int g_socket_nonagle = 1; + +typedef struct +{ + char *name; + int num; +} name2num_t; + +static name2num_t nalnames[] = { + {"tcp", SOCKNAL}, + {"toe", TOENAL}, + {"elan", QSWNAL}, + {"gm", GMNAL}, + {"scimac", SCIMACNAL}, + {NULL, -1} +}; + +static name2num_t * +name2num_lookup_name (name2num_t *table, char *str) +{ + while (table->name != NULL) + if (!strcmp (str, table->name)) + return (table); + else + table++; + return (NULL); +} + +static name2num_t * +name2num_lookup_num (name2num_t *table, int num) +{ + while (table->name != NULL) + if (num == table->num) + return (table); + else + table++; + return (NULL); +} + +int +ptl_name2nal (char *str) +{ + name2num_t *e = name2num_lookup_name (nalnames, str); + + return ((e == NULL) ? 0 : e->num); +} + +static char * +nal2name (int nal) +{ + name2num_t *e = name2num_lookup_num (nalnames, nal); + + return ((e == NULL) ? "???" : e->name); +} + +static int +nid2nal (ptl_nid_t nid) +{ + /* BIG pragmatic assumption */ + return ((((__u32)nid) & 0xffff0000) != 0 ? 
SOCKNAL : QSWNAL); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + struct hostent *he; + int a; + int b; + int c; + int d; + + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) + { + __u32 addr = (a<<24)|(b<<16)|(c<<8)|d; + + *nidp = (ptl_nid_t)addr; + return (0); + } + + if ((('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) && + (he = gethostbyname (str)) != NULL) + { + __u32 addr = *(__u32 *)he->h_addr; + + *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + return (0); + } + + if (sscanf (str, "%i", &a) == 1) + { + *nidp = (ptl_nid_t)a; + return (0); + } + + if (sscanf (str, "%x", &a) == 1) + { + *nidp = (ptl_nid_t) a; + return (0); + } + + return (-1); +} + +char * +ptl_nid2str (char *buffer, ptl_nid_t nid) +{ + switch (nid2nal(nid)) + { + case QSWNAL: + sprintf (buffer, LPD64, nid); + return (buffer); + + case SCIMACNAL: + sprintf (buffer, LPX64, nid); + return (buffer); + + case SOCKNAL: { + __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ + struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); + + if (he != NULL) + strcpy (buffer, he->h_name); + else + { + addr = (__u32)nid; + sprintf (buffer, "%d.%d.%d.%d", + (addr>>24)&0xff, (addr>>16)&0xff, (addr>>8)&0xff, addr&0xff); + } + return (buffer); + } + + default: + sprintf (buffer, "nid2nal broken"); + return (buffer); + } +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return 
(rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int ptl_initialize(int argc, char **argv) +{ + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + return 0; +} + + +int jt_ptl_network(int argc, char **argv) +{ + int nal; + + if (argc != 2 || + (nal = ptl_name2nal (argv[1])) == 0) + { + name2num_t *entry; + + fprintf(stderr, "usage: %s \n", argv[0]); + for (entry = nalnames; entry->name != NULL; entry++) + fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name); + fprintf(stderr, ">\n"); + } + else + g_nal = nal; + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. 
*/ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... */ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +int jt_ptl_connect(int argc, char **argv) +{ + if (argc < 2) { + usage: + fprintf(stderr, "usage: %s or \n", + argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + ptl_nid_t peer_nid; + struct hostent *he; + struct portal_ioctl_data data; + struct sockaddr_in srvaddr; + char *flag; + int fd, rc; + int nonagle = 0; + int rxmem = 0; + int txmem = 0; + int bind_irq = 0; + int xchange_nids = 0; + int o; + int olen; + + if (argc < 3) { + goto usage; + } + + he = gethostbyname(argv[1]); 
+ if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + g_port = atol(argv[2]); + + if (argc > 3) + for (flag = argv[3]; *flag != 0; flag++) + switch (*flag) + { + case 'i': + bind_irq = 1; + break; + + case 'x': + xchange_nids = 1; + break; + + default: + fprintf (stderr, "unrecognised flag '%c'\n", + *flag); + return (-1); + } + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(g_port); + srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", + strerror(errno)); + return -1; + } + + if (g_socket_nonagle) + { + o = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_rxmem != 0) + { + o = g_socket_rxmem; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_txmem != 0) + { + o = g_socket_txmem; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + fprintf(stderr, "connect() failed: %s\n", + strerror(errno)); + return -1; + } + + olen = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) + fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); + olen = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) + fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); + olen = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) + fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); + + if 
(xchange_nids) { + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc != 0) + { + fprintf (stderr, "failed to get my nid: %s\n", + strerror (errno)); + close (fd); + return (-1); + } + + rc = exchange_nids (fd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (fd); + return (-1); + } + } + else + peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */ + + printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], + peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled"); + + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: " + "%s\n", strerror(errno)); + close (fd); + return -1; + } + + g_nid = peer_nid; + printf("Connection to "LPX64" registered with socknal\n", g_nid); + + rc = close(fd); + if (rc) { + fprintf(stderr, "close failed: %d\n", rc); + } + } else if (g_nal == QSWNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == GMNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == SCIMACNAL) { + unsigned int tmpnid; + if(sscanf(argv[1], "%x", &tmpnid) == 1) { + g_nid=tmpnid; + } + else { + fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]); + } + + + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + } + + return 0; +} + +int jt_ptl_disconnect(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Disconnecting ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to remove connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'disconnect' doesn't make any sense for " + "elan.\n"); + } else if (g_nal == GMNAL) { + printf("'disconnect' doesn't make any sense for " + "GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'disconnect' doesn't make any sense for " + "SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_push_connection (int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Pushing ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to push connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'push' doesn't make any sense for elan.\n"); + } else if (g_nal == GMNAL) { + printf("'push' doesn't make any sense for GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'push' doesn't make any sense for SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_ping(int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + long count = 1; + long size = 4; + long timeout = 1; + struct portal_ioctl_data data; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc > 2) + { + count = atol(argv[2]); + + if (count < 0 || count > 20000) + { + fprintf(stderr, "are you insane? %ld is a crazy count.\n", count); + return -1; + } + } + + if (argc > 3) + size= atol(argv[3]); + + if (argc > 4) + timeout = atol (argv[4]); + + PORTAL_IOC_INIT (data); + data.ioc_count = count; + data.ioc_size = size; + data.ioc_nid = nid; + data.ioc_nal = g_nal; + data.ioc_timeout = timeout; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); + if (rc) { + fprintf(stderr, "failed to start pinger: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_ptl_mynid(int argc, char **argv) +{ + int rc; + struct hostent *h; + char buf[1024], *hostname; + struct portal_ioctl_data data; + ptl_nid_t mynid; + + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + fprintf(stderr, "hostname defaults to the hostname of the " + "machine.\n"); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (g_nal == QSWNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for elan.\n"); + return -1; + } else if (g_nal == GMNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for GM.\n"); + return -1; + } else if (g_nal == SCIMACNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for SCI.\n"); + return -1; + } + + if (g_nal != SOCKNAL && g_nal != TOENAL) { + 
fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + return -1; + } + + if (argc == 1) { + if (gethostname(buf, sizeof(buf)) != 0) { + fprintf(stderr, "gethostname failed: %s\n", + strerror(errno)); + return -1; + } + hostname = buf; + } else { + hostname = argv[1]; + } + + h = gethostbyname(hostname); + + if (!h) { + fprintf(stderr, "cannot get address for host '%s': %d\n", + hostname, h_errno); + return -1; + } + mynid = (ptl_nid_t)ntohl (*(__u32 *)h->h_addr); /* HOST byte order */ + + PORTAL_IOC_INIT(data); + data.ioc_nid = mynid; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc < 0) + fprintf(stderr, "IOC_PORTAL_REGISTER_MYNID failed: %s\n", + strerror(errno)); + else + printf("registered my nid "LPX64" (%s)\n", mynid, hostname); + return 0; +} + +int +jt_ptl_fail_nid (int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + unsigned int threshold; + struct portal_ioctl_data data; + + if (argc < 2 || argc > 3) + { + fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (!strcmp (argv[1], "_all_")) + nid = PTL_NID_ANY; + else if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc < 3) + threshold = PTL_MD_THRESH_INF; + else if (sscanf (argv[2], "%i", &threshold) != 1) { + fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); + return (-1); + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_count = threshold; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); + if (rc < 0) + fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", + strerror (errno)); + else + printf ("%s %s\n", threshold == 0 ? 
"Unfailing" : "Failing", argv[1]); + + return (0); +} + +int +jt_ptl_rxmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + + g_socket_rxmem = size; + } + printf ("Socket rmem = %d\n", g_socket_rxmem); + return (0); +} + +int +jt_ptl_txmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + g_socket_txmem = size; + } + printf ("Socket txmem = %d\n", g_socket_txmem); + return (0); +} + +int +jt_ptl_nagle (int argc, char **argv) +{ + int enable; + + if (argc > 1) + { + if (Parser_bool (&enable, argv[1]) != 0) + { + fprintf (stderr, "Can't parse boolean %s\n", argv[1]); + return (0); + } + g_socket_nonagle = !enable; + } + printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled"); + return (0); +} + +int +jt_ptl_add_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid1; + ptl_nid_t nid2; + ptl_nid_t gateway_nid; + int gateway_nal; + int rc; + + if (argc < 3) + { + fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); + return (-1); + } + + gateway_nal = nid2nal (gateway_nid); + + if (ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) + nid2 = nid1; + else if (ptl_parse_nid (&nid2, argv[3]) != 0) + { + fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = gateway_nid; + data.ioc_nal = gateway_nal; + data.ioc_nid2 = MIN (nid1, nid2); + data.ioc_nid3 = MAX (nid1, nid2); + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data); + if
(rc != 0) + { + fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_del_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid; + int rc; + + if (argc < 2) + { + fprintf (stderr, "usage: %s targetNID\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_print_routes (int argc, char **argv) +{ + char buffer[3][128]; + struct portal_ioctl_data data; + int rc; + int index; + int gateway_nal; + ptl_nid_t gateway_nid; + ptl_nid_t nid1; + ptl_nid_t nid2; + + + for (index = 0;;index++) + { + PORTAL_IOC_INIT(data); + data.ioc_count = index; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data); + if (rc != 0) + break; + + gateway_nal = data.ioc_nal; + gateway_nid = data.ioc_nid; + nid1 = data.ioc_nid2; + nid2 = data.ioc_nid3; + + printf ("%8s %18s : %s - %s\n", + nal2name (gateway_nal), + ptl_nid2str (buffer[0], gateway_nid), + ptl_nid2str (buffer[1], nid1), + ptl_nid2str (buffer[2], nid2)); + } + return (0); +} + diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c new file mode 100644 index 0000000..d38bd4a --- /dev/null +++ b/lnet/utils/ptlctl.c @@ -0,0 +1,64 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include + +#include "parser.h" + + +command_t list[] = { + {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: | for tcp/elan respectively)"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname])"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname])"}, + {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, + {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID)"}, + {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, + {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, + {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, + {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (ptl_initialize(argc, argv) < 0) + exit(1); + + Parser_init("ptlctl > ", list); + if (argc >
1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + return 0; +} diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c new file mode 100644 index 0000000..37da12c --- /dev/null +++ b/lnet/utils/routerstat.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +double +timenow () +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000.0); +} + +void +do_stat (int fd) +{ + static char buffer[1024]; + static double last = 0.0; + double now; + double t; + long long bytes; + long packets; + long errors; + long depth; + int n; + + lseek (fd, 0, SEEK_SET); + now = timenow(); + n = read (fd, buffer, sizeof (buffer) - 1); + if (n < 0) + { + fprintf (stderr, "Can't read statfile\n"); + exit (1); + } + buffer[n] = 0; + + n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth); + + if (n < 3) + { + fprintf (stderr, "Can't parse statfile\n"); + exit (1); + } + + if (last == 0.0) + printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", + bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors); + else + { + t = now - last; + + printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", + bytes, ((double)bytes)/((1<<20) * t), + packets, (long long)((packets == 0) ?
0LL : bytes/packets), (long)(packets/t), + errors, (long)(errors/t)); + } + + if (n == 4) + printf (" (%ld)\n", depth); + else + printf ("\n"); + + fflush (stdout); + + lseek (fd, 0, SEEK_SET); + write (fd, "\n", 1); + last = timenow(); +} + +int main (int argc, char **argv) +{ + int interval = 0; + int fd; + + if (argc > 1) + interval = atoi (argv[1]); + + fd = open ("/proc/sys/portals/router", O_RDWR); + if (fd < 0) + { + fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); + return (1); + } + + do_stat (fd); + if (interval == 0) + return (0); + + for (;;) + { + sleep (interval); + do_stat (fd); + } +} diff --git a/lustre/Makefile.mk b/lustre/Makefile.mk new file mode 100644 index 0000000..e540148 --- /dev/null +++ b/lustre/Makefile.mk @@ -0,0 +1,4 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += mds/ diff --git a/lustre/mds/Makefile.mk b/lustre/mds/Makefile.mk new file mode 100644 index 0000000..6b712fb --- /dev/null +++ b/lustre/mds/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include fs/lustre/portals/Kernelenv + +obj-y += mds.o + +mds-objs := mds_lov.o handler.o mds_reint.o mds_fs.o lproc_mds.o mds_internal.h mds_updates.o mds_open.o simple.o target.o diff --git a/lustre/portals/AUTHORS b/lustre/portals/AUTHORS new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/ChangeLog b/lustre/portals/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/Kernelenv.in b/lustre/portals/Kernelenv.in new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lustre/portals/Kernelenv.in @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lustre/portals/Kernelenv.mk b/lustre/portals/Kernelenv.mk new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lustre/portals/Kernelenv.mk @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lustre/portals/Makefile.am b/lustre/portals/Makefile.am new file mode 100644 index 0000000..3c42103 --- /dev/null +++ b/lustre/portals/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +EXTRA_DIST = Rules.linux archdep.m4 MCP +DIST_SUBDIRS = libcfs portals knals unals utils tests doc router +SUBDIRS = libcfs portals knals unals utils tests doc router diff --git a/lustre/portals/Makefile.mk b/lustre/portals/Makefile.mk new file mode 100644 index 0000000..be0e51a --- /dev/null +++ b/lustre/portals/Makefile.mk @@ -0,0 +1,6 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += libcfs/ +obj-y += knals/ +obj-y += router/ diff --git a/lustre/portals/NEWS b/lustre/portals/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/README b/lustre/portals/README new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/Rules.linux.in b/lustre/portals/Rules.linux.in new file mode 100644 index 0000000..8247deb --- /dev/null +++ b/lustre/portals/Rules.linux.in @@ -0,0 +1,37 @@ +# included in Linux kernel directories +# Rules for module building + +MODLINK=@MOD_LINK@ +if LINUX25 + + +basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g') +AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2 -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename) + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + + + +else + + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + + + +endif + + +tags: + rm -f $(top_srcdir)/TAGS + rm -f $(top_srcdir)/tags + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a + + + + diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 new file mode 100644 index 0000000..0315644 --- /dev/null +++ b/lustre/portals/archdep.m4 @@ -0,0 +1,206 
@@ + +# -------- in kernel compilation? (2.5 only) ------------- +AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles]) +AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) +echo "Makefile for in kernel build: $INKERNEL" + +# -------- liblustre compilation -------------- +AC_ARG_WITH(lib, [ --with-lib compile lustre library], host_cpu="lib") + +# -------- set linuxdir ------------ + +AC_ARG_WITH(linux, [ --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux) +AC_SUBST(LINUX) + +# --------- UML? -------------------- +AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...) +if test $host_cpu = "lib" ; then + host_cpu="lib" + AC_MSG_RESULT(no building Lustre library) +else + if test -e $LINUX/include/asm-um ; then + if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then + host_cpu="um"; + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no (asm doesn't point at asm-um)) + fi + + else + AC_MSG_RESULT(no (asm-um missing)) + fi +fi + +# --------- Linux 25 ------------------ + +AC_MSG_CHECKING(if you are running linux 2.5) +if test -e $LINUX/include/linux/namei.h ; then + linux25="yes" + AC_MSG_RESULT(yes) +else + linux25="no" + AC_MSG_RESULT(no) +fi +AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) +echo "Makefiles for in linux 2.5 build: $LINUX25" + +# ------- Makeflags ------------------ + +AC_MSG_CHECKING(setting make flags system architecture: ) +case ${host_cpu} in + lib ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall ' + KCPPFLAGS='-D__arch_lib__ ' + MOD_LINK=elf_i386 +;; + um ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common ' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include 
-I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include ' + ;; + esac + + MOD_LINK=elf_i386 +;; + i*86 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + ;; + esac + MOD_LINK=elf_i386 +;; + + alphaev6 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alphaev67 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alpha* ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + ia64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' + KCPPFLAGS='-D__KERNEL__ -DMODULE' + MOD_LINK=elf64_ia64 +;; + + sparc64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing 
-fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf64_sparc + +;; + + powerpc ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf32ppclinux +;; + + *) + AC_ERROR("Unknown Linux Platform: $host_cpu") +;; +esac + +# ----------- make dep run? ------------------ + +if test $host_cpu != "lib" ; then + AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) ) + if test -f $LINUX/include/linux/config.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.) + fi +fi + +# ------------ include paths ------------------ + +if test $host_cpu != "lib" ; then + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include -I$(LINUX)/include' +else + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include' +fi +CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS" + +if test $host_cpu != "lib" ; then +# ------------ autoconf.h ------------------ + AC_MSG_CHECKING(if autoconf.h is in kernel source) + if test -f $LINUX/include/linux/autoconf.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.) + fi + +# ------------ RELEASE and moduledir ------------------ + AC_MSG_CHECKING(for Linux release) + + dnl We need to rid ourselves of the nasty [ ] quotes. 
+ changequote(, ) + dnl Get release from version.h + RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`" + changequote([, ]) + + moduledir='$(libdir)/modules/'$RELEASE/kernel + AC_SUBST(moduledir) + + modulefsdir='$(moduledir)/fs/$(PACKAGE)' + AC_SUBST(modulefsdir) + + AC_MSG_RESULT($RELEASE) + AC_SUBST(RELEASE) + +# ---------- modversions? -------------------- + AC_MSG_CHECKING(for MODVERSIONS) + if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1; + then + MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB" + AC_MSG_RESULT(yes) + else + MFLAGS= + AC_MSG_RESULT(no) + fi +fi + +# ---------- SMP ------------------- +#AC_MSG_CHECKING(for SMP) +#if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then +# SMPFLAG= +# AC_MSG_RESULT(yes) +#else +# SMPFLAG= +# AC_MSG_RESULT(no) +#fi + +CFLAGS="$KCFLAGS" +CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS " + +AC_SUBST(MOD_LINK) +AC_SUBST(LINUX25) \ No newline at end of file diff --git a/lustre/portals/autogen.sh b/lustre/portals/autogen.sh new file mode 100755 index 0000000..9deed73 --- /dev/null +++ b/lustre/portals/autogen.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +aclocal && +automake --add-missing && +${AUTOCONF:-autoconf} diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4 new file mode 100644 index 0000000..4e8dbbb --- /dev/null +++ b/lustre/portals/build.m4 @@ -0,0 +1,108 @@ + +# ---------- directories --------- + + +# --------- unsigned long long sane? ------- + +AC_CHECK_SIZEOF(unsigned long long, 0) +echo "---> size SIZEOF $SIZEOF_unsigned_long_long" +echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" +if test $ac_cv_sizeof_unsigned_long_long != 8 ; then + AC_MSG_ERROR([** we assume that sizeof(long long) == 8. 
Tell phil@clusterfs.com]) +fi + +# directories for binaries +ac_default_prefix= +bindir='${exec_prefix}/usr/bin' +sbindir='${exec_prefix}/usr/sbin' +includedir='${prefix}/usr/include' + +# Directories for documentation and demos. +docdir='${prefix}/usr/share/doc/$(PACKAGE)' +AC_SUBST(docdir) +demodir='$(docdir)/demo' +AC_SUBST(demodir) +pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples' +AC_SUBST(pkgexampledir) +pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre' +AC_SUBST(pymoddir) +modulenetdir='$(moduledir)/net/$(PACKAGE)' +AC_SUBST(modulenetdir) + + +# ---------- BAD gcc? ------------ +AC_PROG_RANLIB +AC_PROG_CC +AC_MSG_CHECKING(for buggy compiler) +CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` +bad_cc() { + echo + echo " '$CC_VERSION'" + echo " has been known to generate bad code, " + echo " please get an updated compiler." + AC_MSG_ERROR(sorry) +} +TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` +if test "$TMP_VERSION" = "gcc version 2.95"; then + bad_cc +fi +case "$CC_VERSION" in + # ost_pack_niobuf putting 64bit NTOH temporaries on the stack + # without "sub $0xc,%esp" to protect the stack from being + # stomped on by interrupts (bug 606) + "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") + bad_cc + ;; + # mandrake's similar sub 0xc compiler bug + # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2 + "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") + bad_cc + ;; + *) + AC_MSG_RESULT(no known problems) + ;; +esac +# end ------ BAD gcc? 
------------ + +# -------- Check for required packages -------------- + +# this doesn't seem to work on older autoconf +# AC_CHECK_LIB(readline, readline,,) +AC_ARG_ENABLE(readline, [ --enable-readline use readline library],, + enable_readline="yes") + +if test "$enable_readline" = "yes" ; then + LIBREADLINE="-lreadline -lncurses" + HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1" +else + LIBREADLINE="" + HAVE_LIBREADLINE="" +fi +AC_SUBST(LIBREADLINE) +AC_SUBST(HAVE_LIBREADLINE) + +AC_ARG_ENABLE(efence, [ --enable-efence use efence library],, + enable_efence="no") + +if test "$enable_efence" = "yes" ; then + LIBEFENCE="-lefence" + HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1" +else + LIBEFENCE="" + HAVE_LIBEFENCE="" +fi +AC_SUBST(LIBEFENCE) +AC_SUBST(HAVE_LIBEFENCE) + +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) +AC_MSG_CHECKING(if you are building lib lustre) +if test "$host_cpu" = "lib"; then + AC_MSG_RESULT(yes) + libdir='${exec_prefix}/lib/lustre' +else + AC_MSG_RESULT(no) +fi + +# end -------- Kernel build environment. ----------------- + + diff --git a/lustre/portals/configure.in b/lustre/portals/configure.in new file mode 100644 index 0000000..7c32246 --- /dev/null +++ b/lustre/portals/configure.in @@ -0,0 +1,38 @@ +# This version is here to make autoconf happy; the name is a file which is +# "unique" to this directory so that configure knows where it should run. +AC_INIT(knals/Makefile.am, 3.0) +AC_CANONICAL_SYSTEM +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +# Automake variables. Steal the version number from packaging/intersync.spec +AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c])) +# AM_MAINTAINER_MODE + +sinclude(archdep.m4) +sinclude(build.m4) +sincludemalloc() callback does not need to be called +for each object. 
+ +The objects are maintained on a per-object type singly linked free +list and contain a pointer to the next free object. This pointer +is NULL if the object is not on the free list and is non-zero +if it is on the list. The special sentinel value of 0xDEADBEEF +is used to mark the end of the free list since NULL could +indicate that the last object in the list is not free. + +When one of the lib_*_alloc() functions is called, the library +returns the head of the free list and advances the head pointer +to the next item on the list. The special case of 0xDEADBEEF is +checked and a NULL pointer is returned if there are no more +objects of this type available. The lib_*_free() functions +are even simpler -- check to ensure that the object is not already +free, set its next pointer to the current head and then set +the head to be this newly freed object. + +Since C does not have templates, I did the next best thing and wrote +the memory pool allocation code as a macro that expands based on the +type of the argument. The mk_alloc(T) macro expands to +write the _lib_T_alloc() and lib_T_free() functions. +It requires that the object have a pointer of the type T named +"next_free". There are also functions that map _lib_T_alloc() +to lib_T_alloc() so that the library can add some extra +functionality to the T constructor. + + + +LINKED LISTS: +------------ + +Many of the active Portals objects are stored in doubly linked lists +when they are active. These are always implemented with the pointer +to the next object and a pointer to the next pointer of the +previous object. This avoids the "dummy head" object or +special cases for inserting at the beginning or end of the list. +The pointer manipulations are a little hairy at times, but +I hope that they are understandable.
+ +The actual linked list code is implemented as macros in , +although the object has to know about + + diff --git a/lustre/portals/doc/Makefile.am b/lustre/portals/doc/Makefile.am new file mode 100644 index 0000000..7c65e6c --- /dev/null +++ b/lustre/portals/doc/Makefile.am @@ -0,0 +1,46 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +LYX2PDF = lyx --export pdf +LYX2TXT = lyx --export text +LYX2HTML = lyx --export html +SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps + +DOCS = portals3.pdf +IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps +LYXFILES= portals3.lyx + +MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(GENERATED) +GENERATED = +EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) + +all: $(DOCS) + +# update date and version in document +date := $(shell date +%x) +tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') +addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g' + +# Regenerate when the $(VERSION) or $Name: $ changes. 
+.INTERMEDIATE: $(GENERATED) +$(GENERATED) : %.lyx: %.lin Makefile + $(addversion) $< > $@ + +.lyx.pdf: + @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" + +.lyx.txt: + @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" +.lyx.html: + @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" +.fig.eps: + -fig2dev -L eps $< > $@ + +portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx + +syncweb: portals3.pdf +# cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf +# ( cd /usr/src/www ; make lustre ; make synclustre ) + diff --git a/lustre/portals/doc/Message-life-cycle b/lustre/portals/doc/Message-life-cycle new file mode 100644 index 0000000..e8cc7e2 --- /dev/null +++ b/lustre/portals/doc/Message-life-cycle @@ -0,0 +1,118 @@ +This documents the life cycle of a message as it arrives and is handled by +a basic async, packetized NAL. There are four types of messages that have +slightly different life cycles, so they are addressed independently. + + +Put request +----------- + +1. NAL notices that there is an incoming message header on the network +and reads a ptl_hdr_t in from the wire. + +2. It may store additional NAL specific data that provides context +for this event in a void* that it will interpret in some fashion +later. + +3. The NAL calls lib_parse() with a pointer to the header and its +private data structure. + +4. The library decodes the header and may build a message state +object that describes the event to be written and the ACK to be +sent, if any. It then calls nal->recv() with the private data +that the NAL passed in, a pointer to the message state object +and a translated user address. + + The NAL will have been given a chance to pretranslate + all user addresses when the buffers are created. This + process is described in the NAL-HOWTO. + +5.
The NAL should restore whatever context it required from the +private data pointer, begin receiving the bytes and possibly store +some extra state of its own. It should return at this point. + + + +Get request +----------- + +1. As with a Put, the NAL notices the incoming message header and +passes it to lib_parse(). + +2. The library decodes the header and calls nal->recv() with a +zero byte length, offset and destination to instruct it to clean +up the wire after reading the header. The private data will +be passed in as well, allowing the NAL to retrieve any state +or context that it requires. + +3. The library may build a message state object to possibly +write an event log or invalidate a memory region. + +4. The library will build a ptl_msg_t header that specifies the +Portals protocol information for delivery at the remote end. + +5. The library calls nal->send() with the pre-built header, +the optional message state object, the four part address +component, a translated user pointer + offset, and some +other things. + +6. The NAL is to put the header on the wire or copy it at +this point (since it lives on the library's stack). It should store some +amount of state about its current position in the message and +the destination address. + +7. And then return to the library. + + +Reply request +------------- + +1. Starting at "The library decodes the header..." + +2. The library decodes the header and calls nal->recv() +to bring in the rest of the message. Flow continues in +exactly the same fashion as with all other receives. + + +Ack request +----------- + +1. The library decodes the header, builds the appropriate data +structures for the event in a message state object and calls nal->recv() +with a zero byte length, etc. + + +Packet arrival +-------------- + +1.
The NAL should notice the arrival of a packet, retrieve whatever +state it needs from the message ID or other NAL specific header data +and place the data bytes directly into the user address that was +given to nal->recv(). + + How this happens is outside the scope of the Portals library + and solely determined by the NAL... + +2. If this is the last packet in a message, the NAL should retrieve +the lib_msg_t *cookie that it was given in the call to nal->recv() +and pass it to lib_finalize(). lib_finalize() may call nal->send() +to send an ACK, nal->write() to record an entry in the event log, +nal->invalidate() to unregister a region of memory or do nothing at all. + +3. It should then clean up any remaining NAL specific state about +the message and go back into the main loop. + + +Outgoing packets +---------------- + +1. When the NAL has pending output, it should put the packets on +the wire wrapped with whatever implementation-specific wrappers. + +2. Once it has output all the packets of a message it should +call lib_finalize() with the message state object that was +handed to nal->send(). This will allow the library to clean +up its state regarding the message and write any pending event +entries. + + + diff --git a/lustre/portals/doc/NAL-HOWTO b/lustre/portals/doc/NAL-HOWTO new file mode 100644 index 0000000..ea38aed --- /dev/null +++ b/lustre/portals/doc/NAL-HOWTO @@ -0,0 +1,293 @@ +This document is a first attempt at describing how to write a NAL +for the Portals 3 library. It also defines the library architecture +and the abstraction of protection domains. + + +First, an overview of the architecture: + + Application + +----|----+-------- + | + API === NAL (User space) + | +---------+---|----- + | + LIB === NAL (Library space) + | +---------+---|----- + + Physical wire (NIC space) + + +Application + API +API-side NAL +------------ +LIB-side NAL + LIB +LIB-side NAL + wire + +Communication is through the indicated paths via well defined +interfaces.
The API and LIB portions are written to be portable +across platforms and do not depend on the network interface. + +Communication between the application and the API code is +defined in the Portals 3 API specification. This is the +user-visible portion of the interface and should be the most +stable. + + + +API-side NAL: +------------ + +The user space NAL needs to implement only a few functions +that are stored in a nal_t data structure and called by the +API-side library: + + int forward( nal_t *nal, + int index, + void *args, + size_t arg_len, + void *ret, + size_t ret_len + ); + +Most of the data structures in the portals library are held in +the LIB section of the code, so it is necessary to forward API +calls across the protection domain to the library. This is +handled by the NAL's forward method. Once the argument and return +blocks are on the remote side the NAL should call lib_dispatch() +to invoke the appropriate API function. + + int validate( nal_t *nal, + void *base, + size_t extent, + void **trans_base, + void **trans_data + ); + +The validate method provides a means for the NAL to prevalidate +and possibly pretranslate user addresses into a form suitable +for fast use by the network card or kernel module. The trans_base +pointer will be used by the library every time it needs to +refer to the block of memory. The trans_data result is a +cookie that will be handed to the NAL along with the trans_base. + +The library never performs calculations on the trans_base value; +it only computes offsets that are then handed to the NAL. + + + int shutdown( nal_t *nal, int interface ); + +Brings down the network interface. The remote NAL side should +call lib_fini() to bring down the library side of the network. + + void yield( nal_t *nal ); + +This allows the user application to gracefully give up the processor +while busy waiting.
Performance critical applications may not +want to take the time to call this function, so it should be an +option to the PtlEQWait call. Right now it is not implemented as such. + +Lastly, the NAL must implement a function named PTL_IFACE_*, where +* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR. +This initialization function is to set up communication with the +library-side NAL, which should call lib_init() to bring up the +network interface. + + + +LIB-side NAL: +------------ + +On the library-side, the NAL has much more responsibility. It +is responsible for calling lib_dispatch() on behalf of the user, +it is also responsible for bringing packets off the wire and +pushing bits out. As on the user side, the methods are stored +in a nal_cb_t structure that is defined on a per network +interface basis. + +The calls to lib_dispatch() need to be examined. The prototype: + + void lib_dispatch( + nal_cb_t *nal, + void *private, + int index, + void *arg_block, + void *ret_block + ); + +has two complications. The private field is a NAL-specific +value that will be passed to any callbacks produced as a result +of this API call. Kernel module implementations may use this +for task structures, or perhaps network card data. It is ignored +by the library. + +Secondly, the arg_block and ret_block must be in the same protection +domain as the library. The NAL's two halves must communicate the +sizes and perform the copies. After the call, the buffer pointed +to by ret_block will be filled in and should be copied back to +the user space. How this is to be done is NAL specific. + + int lib_parse( + nal_cb_t *nal, + ptl_hdr_t *hdr, + void *private + ); + +This is the only other entry point into the library from the NAL. +When the NAL detects an incoming message on the wire it should read +sizeof(ptl_hdr_t) bytes and pass a pointer to the header to +lib_parse(). 
It may set private to be anything that it needs to +tie the incoming message to callbacks that are made as a result +of this event. + +The method calls are: + + int (*send)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int nid, + int pid, + int gid, + int rid, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t len + ); + +This is a tricky function -- it must support async output +of messages as well as properly synchronized event log writing. +The private field is the same that was passed into lib_dispatch() +or lib_parse() and may be used to tie this call to the event +that initiated the entry to the library. + +The cookie is a pointer to a library private value that must +be passed to lib_finalize() once the message has been completely +sent. It should not be examined by the NAL for any meaning. + +The four ID fields are passed in, although some implementations +may not use all of them. + +The single base pointer has been replaced with the translated +address that the API NAL generated in the api_nal->validate() +call. The trans_data is unchanged and the offset is in bytes. + + + int (*recv)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t mlen, + size_t rlen + ); + +This callback will only be called in response to lib_parse(). +The cookie, trans_base and trans_data are as discussed in send(). +The NAL should read mlen bytes from the wire, deposit them into +trans_base + offset and then discard (rlen - mlen) bytes. +Once the entire message has been received the NAL should call +lib_finalize() with the lib_msg_t *cookie. + +The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0 +are used to indicate that the NAL should clean up the wire. This could +be implemented as a blocking call, although having it return as quickly +as possible is desirable.
+ + int (*write)( + nal_cb_t *nal, + void *private, + user_ptr trans_addr, + user_ptr trans_data, + size_t offset, + + void *src_addr, + size_t len + ); + +This is essentially a cross-protection domain memcpy(). The user address +has been pretranslated by the api_nal->validate() call. + + void *(*malloc)( + nal_cb_t *nal, + size_t len + ); + + void (*free)( + nal_cb_t *nal, + void *buf + ); + +Since the NAL may be in a non-standard hosted environment it can +not call malloc(). This allows the library side NAL to implement +the system specific malloc(). In the current reference implementation +the library only calls nal->malloc() when the network interface is +initialized and then calls free when it is brought down. The library +maintains its own pool of objects for allocation so only one call to +malloc is made per object type. + + void (*invalidate)( + nal_cb_t *nal, + user_ptr trans_base, + user_ptr trans_data, + size_t extent + ); + +User addresses are validated/translated at the user-level API NAL +method, which is likely to push them to this level. Meanwhile, +the library NAL will be notified when the library no longer +needs the buffer. Overlapped buffers are not detected by the +library, so the NAL should ref count each page involved. + +Unfortunately we have a few bugs when the invalidate method is +called. It is still in progress... + + void (*printf)( + nal_cb_t *nal, + const char *fmt, + ... + ); + +As with malloc(), the library does not have any way to do printf +or printk. It is not necessary for the NAL to implement this +call, although omitting it will make debugging difficult. + + void (*cli)( + nal_cb_t *nal, + unsigned long *flags + ); + + void (*sti)( + nal_cb_t *nal, + unsigned long *flags + ); + +These are used by the library to mark critical sections.
+ + int (*gidrid2nidpid)( + nal_cb_t *nal, + ptl_id_t gid, + ptl_id_t rid, + ptl_id_t *nid, + ptl_id_t *pid + ); + + + int (*nidpid2gidrid)( + nal_cb_t *nal, + ptl_id_t nid, + ptl_id_t pid, + ptl_id_t *gid, + ptl_id_t *rid + ); + +Rolf added these. I haven't looked at how they have to work yet. diff --git a/lustre/portals/doc/file.fig b/lustre/portals/doc/file.fig new file mode 100644 index 0000000..914c294 --- /dev/null +++ b/lustre/portals/doc/file.fig @@ -0,0 +1,111 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1200 750 1650 1050 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1050 1650 750 1200 750 1200 1050 1650 1050 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001 +-6 +6 1200 2325 1650 2625 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2625 1650 2325 1200 2325 1200 2625 1650 2625 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001 +-6 +6 1200 1800 1650 2100 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2100 1650 1800 1200 1800 1200 2100 1650 2100 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001 +-6 +6 1200 1275 1650 1575 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1575 1650 1275 1200 1275 1200 1575 1650 1575 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001 +-6 +6 450 750 900 1200 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 825 450 1050 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1050 900 825 +-6 +6 450 2325 900 2775 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 2400 450 2625 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2625 900 2400 +-6 +6 450 1800 900 2250 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800 +2 1 
0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1875 450 2100 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2100 900 1875 +-6 +6 450 1275 900 1725 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1350 450 1575 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1575 900 1350 +-6 +6 2250 750 3450 2625 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1200 3150 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1500 3150 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1800 3150 1800 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2100 3150 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 975 3150 975 3150 2625 2550 2625 2550 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2400 3150 2400 +4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2400 2550 1350 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1875 2550 1050 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1425 2550 1950 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 900 2550 1650 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 900 1200 900 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1425 1200 1425 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1950 1200 1950 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2475 1200 2475 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2025 2550 2250 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2550 2550 2475 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1875 2850 1875 600 225 600 225 2850 1875 2850 +4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001 diff --git 
a/lustre/portals/doc/flow_new.fig b/lustre/portals/doc/flow_new.fig new file mode 100644 index 0000000..d828dea --- /dev/null +++ b/lustre/portals/doc/flow_new.fig @@ -0,0 +1,213 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 525 2175 1575 2925 +6 675 2287 1425 2812 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001 +4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 2550 1050 2175 525 2550 1050 2925 1575 2550 +-6 +6 3450 1275 4350 1725 +6 3600 1312 4200 1687 +4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001 +4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3450 1275 4350 1275 4350 1725 3450 1725 3450 1275 +-6 +6 4650 1275 5550 1725 +6 4725 1312 5475 1687 +4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001 +4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4650 1275 5550 1275 5550 1725 4650 1725 4650 1275 +-6 +6 1350 525 2250 975 +6 1350 562 2250 937 +4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001 +4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 525 2250 525 2250 975 1350 975 1350 525 +-6 +6 525 1125 1575 1875 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 1500 1050 1125 525 1500 1050 1875 1575 1500 +4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001 +-6 +6 2340 1237 2940 1687 +6 2340 1237 2940 1687 +4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001 +4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001 +4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001 +-6 +-6 +6 525 3225 1575 3975 +6 675 3375 1425 3750 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 525 3600 1050 3225 1575 3600 1050 3975 525 3600 +-6 +6 3300 
3375 4350 3825 +6 3300 3412 4350 3787 +4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3300 3375 4350 3375 4350 3825 3300 3825 3300 3375 +-6 +6 1950 3225 3000 3975 +6 2250 3450 2700 3750 +4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 3000 3600 2475 3225 1950 3600 2475 3975 3000 3600 +-6 +6 3150 4500 4200 4950 +6 3150 4537 4200 4912 +4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3150 4500 4200 4500 4200 4950 3150 4950 3150 4500 +-6 +6 600 4500 1500 4950 +6 675 4537 1425 4912 +4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001 +4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 600 4500 1500 4500 1500 4950 600 4950 600 4500 +-6 +6 4650 4350 5700 5100 +6 4950 4537 5400 4912 +6 4950 4537 5400 4912 +4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001 +4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001 +-6 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 5700 4725 5175 4350 4650 4725 5175 5100 5700 4725 +-6 +6 6000 4500 6900 4950 +6 6225 4575 6675 4875 +4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001 +4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 6000 4500 6900 4500 6900 4950 6000 4950 6000 4500 +-6 +6 1800 4350 2850 5100 +6 2100 4575 2550 4875 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 2850 4725 2325 4350 1800 4725 2325 5100 2850 4725 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 1875 1050 2175 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 1500 
2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 450 1050 1125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1350 750 1050 750 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 2925 1050 3225 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3150 1500 3450 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4350 1500 4650 1500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2100 1500 2625 1125 3150 1500 2625 1875 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 3600 1950 3600 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 3975 1050 4500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 3600 3300 3600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 4725 1800 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 5700 4725 6000 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2850 4725 3150 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4200 4725 4650 4725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 6900 4725 7950 4725 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1575 2550 1650 2550 1800 2550 1800 2400 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 2250 750 2475 750 2625 750 2625 900 2625 1125 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 7500 4725 7500 1650 7500 1500 7350 1500 5550 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 2475 3225 2475 2400 2475 2250 2325 2250 1800 2250 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 3825 3375 3825 2175 3825 2025 3675 2025 1800 2025 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 
2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125 + 4425 4275 4425 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125 + 7275 4275 7275 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001 +4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001 diff --git a/lustre/portals/doc/get.fig b/lustre/portals/doc/get.fig new file mode 100644 index 0000000..28db949 --- /dev/null +++ b/lustre/portals/doc/get.fig @@ -0,0 +1,33 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 2775 900 3525 1200 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001 +4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001 +-6 +6 1350 1725 2175 2025 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 750 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 825 2700 1275 +2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1350 900 1950 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 2400 
300 3600 300 3600 2250 2400 2250 2400 300 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 diff --git a/lustre/portals/doc/ieee.bst b/lustre/portals/doc/ieee.bst new file mode 100644 index 0000000..5367caa --- /dev/null +++ b/lustre/portals/doc/ieee.bst @@ -0,0 +1,1114 @@ +% --------------------------------------------------------------- +% +% $Id: ieee.bst,v 1.1.2.1 2003/05/19 04:25:30 braam Exp $ +% +% by Paolo.Ienne@di.epfl.ch +% +% --------------------------------------------------------------- +% +% no guarantee is given that the format corresponds perfectly to +% IEEE 8.5" x 11" Proceedings, but most features should be ok. +% +% --------------------------------------------------------------- +% +% `ieee' from BibTeX standard bibliography style `abbrv' +% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. +% Copyright (C) 1985, all rights reserved. +% Copying of this file is authorized only if either +% (1) you make absolutely no changes to your copy, including name, or +% (2) if you do make changes, you name it something other than +% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. +% This restriction helps ensure that all standard styles are identical. +% The file btxbst.doc has the documentation for this style. 
+ +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ 
+ { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ } + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number 
either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "In " booktitle emphasize * } + { "In " format.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + key empty$ not and + { 
"all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref 
missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { 
format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ 
output.bibitem + format.authors output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} 
{"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ 
+ } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * + "}\setlength{\itemsep}{-1ex}\small" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} + +% end of file ieee.bst +% 
--------------------------------------------------------------- diff --git a/lustre/portals/doc/mpi.fig b/lustre/portals/doc/mpi.fig new file mode 100644 index 0000000..e1a91b5 --- /dev/null +++ b/lustre/portals/doc/mpi.fig @@ -0,0 +1,117 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 150 1650 900 2025 +4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001 +4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001 +-6 +6 150 150 900 525 +4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001 +4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001 +-6 +6 2550 4125 3150 4725 +4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001 +4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001 +4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001 +-6 +6 1050 1575 1950 1875 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 1575 1950 1575 1950 1875 1050 1875 1050 1575 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001 +-6 +6 5400 1575 6300 2175 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 1575 6300 1575 6300 2175 5400 2175 5400 1575 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001 +-6 +6 5400 2400 6300 3000 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 2400 6300 2400 6300 3000 5400 3000 5400 2400 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001 +-6 +6 1050 2400 1950 2700 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 2400 1950 2400 1950 2700 1050 2700 1050 2400 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001 +-6 +6 1050 825 1950 1125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 825 1950 825 1950 1125 1050 1125 1050 825 +4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1575 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2025 4050 3375 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 675 6600 675 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 1350 6600 1350 +2 2 0 1 0 7 100 0 -1 
0.000 0 0 -1 0 0 5 + 2400 4125 3300 4125 3300 4725 2400 4725 2400 4125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 4500 4050 3675 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 1725 5400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2550 5400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2850 4050 3450 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1800 1500 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 825 3300 825 3300 1275 2400 1275 2400 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 2625 1500 4125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 4125 1950 4125 1950 4425 1050 4425 1050 4125 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 300 1500 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 975 2400 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 1725 2400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 2550 2400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 4275 2400 4275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 1575 3300 1575 3300 2175 2400 2175 2400 1575 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 2400 3300 2400 3300 3000 2400 3000 2400 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4050 3300 5250 3300 5250 3750 4050 3750 4050 3300 +4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001 +4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001 +4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001 +4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001 +4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001 +4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 
840 2850 1875 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001 +4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001 diff --git a/lustre/portals/doc/portals.fig b/lustre/portals/doc/portals.fig new file mode 100644 index 0000000..9b1271b --- /dev/null +++ b/lustre/portals/doc/portals.fig @@ -0,0 +1,68 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 900 1650 900 1650 1200 1350 1200 1350 900 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1800 1350 2100 1350 2100 1650 1800 1650 1800 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2250 1800 2550 1800 2550 2100 2250 2100 2250 1800 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 4200 375 4200 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 525 600 1125 600 1125 2100 525 2100 525 600 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4425 1275 4875 1275 4875 1950 4425 1950 4425 1275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 1200 3150 1200 3150 1500 2550 1500 2550 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 1425 4425 1425 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3600 825 3750 825 3750 1125 3600 1125 3600 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2025 1425 2550 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 4425 750 4875 750 4875 1125 4425 1125 4425 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3675 975 4425 975 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2 + 0 0 1.00 60.00 120.00 + 825 1050 1350 1050 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1350 1500 1500 1650 1500 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 
1950 1575 1950 1800 1950 1950 2100 1950 2250 1950 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 975 1125 975 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 1125 1125 1125 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7 + 0 0 1.00 60.00 120.00 + 3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975 + 3600 975 + 0.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001 +4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001 +4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001 +4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001 +4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001 +4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001 +4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001 diff --git a/lustre/portals/doc/portals3.bib b/lustre/portals/doc/portals3.bib new file mode 100644 index 0000000..323b99f --- /dev/null +++ b/lustre/portals/doc/portals3.bib @@ -0,0 +1,124 @@ +@Article{ Cplant, + title = { {M}assively {P}arallel {C}omputing with + {C}ommodity {C}omponents }, + author = { Ron Brightwell and David S. Greenberg and Arthur + B. 
Maccabe and Rolf Riesen }, + journal = { Parallel Computing }, + volume = { 26 }, + month = { February }, + pages = { 243-266 }, + year = { 2000 } +} + +@Manual{ Portals, + organization = { Sandia National Laboratories }, + title = { {P}uma {P}ortals }, + note = { http://www.cs.sandia.gov/puma/portals }, + year = { 1997 } +} + +@Techreport{ VIA, + title = { {V}irtual {I}nterface {A}rchitecture + {S}pecification {V}ersion 1.0 }, + author = { {Compaq, Microsoft, and Intel} }, + institution = { Compaq, Microsoft, and Intel }, + month = { December }, + year = { 1997 } +} + +@Techreport{ ST, + title = { {I}nformation {T}echnology - {S}cheduled + {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 }, + author = { {Task Group of Technical Committee T11} }, + institution = { Accredited Standards Committee NCITS }, + month = { July }, + year = { 1998 } +} + +@Manual{ TFLOPS, + organization = { Sandia National Laboratories }, + title = { ASCI Red }, + note = { http://www.sandia.gov/ASCI/TFLOP }, + year = { 1996 } +} + +@Techreport{ GM, + title = { The {GM} {M}essage {P}assing {S}ystem }, + author = { {Myricom, Inc.} }, + institution = { {Myricom, Inc.} }, + year = { 1997 }, +} + +@Article{ MPIstandard, + title = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard }, + author = { {Message Passing Interface Forum} }, + journal = { The International Journal of Supercomputer Applications + and High Performance Computing }, + volume = { 8 }, + year = { 1994 } +} + +@Inproceedings{ PumaOS, + author = "Lance Shuler and Chu Jong and Rolf Riesen and + David van Dresser and Arthur B. Maccabe and + Lee Ann Fisk and T. Mack Stallcup", + booktitle = "Proceeding of the 1995 Intel Supercomputer + User's Group Conference", + title = "The {P}uma Operating System for Massively Parallel Computers", + organization = "Intel Supercomputer User's Group", + year = 1995 +} + +@InProceedings{ SUNMOS, +author = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and + Stephen R. 
Wheat", +title = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide", +booktitle = "Proceedings of the {Intel} Supercomputer Users' Group. 1994 + Annual North America Users' Conference.", +year = 1994, +pages = "245--251", +month = "June", +location = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps" +} + +@InProceedings { PumaMPI, + title = { Design and Implementation of {MPI} on {P}uma Portals }, + author = { Ron Brightwell and Lance Shuler }, + booktitle = { Proceedings of the Second MPI Developer's Conference }, + pages = { 18-25 }, + month = { July }, + year = { 1996 } +} + +@Inproceedings{ FM2, + author = { Mario Lauria and Scott Pakin and Andrew Chien }, + title = { {E}fficient {L}ayering for {H}igh {S}peed + {C}ommunication: {F}ast {M}essages 2.x }, + Booktitle = { Proceedings of the IEEE International Symposium + on High Performance Distributed Computing }, + year = { 1998 } +} + +@Manual { CraySHMEM, + title = "SHMEM Technical Note for C, SG-2516 2.3", + organization = "Cray Research, Inc.", + month = "October", + year = 1994 +} + +@Manual { MPI2, + title = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface", + organization = "Message Passing Interface Forum", + note = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html", + month = "July", + year = 1997 +} + +@InProceedings { PMMPI, + title = { {The Design and Implementation of Zero Copy MPI Using + Commodity Hardware with a High Performance Network} }, + author = { Francis O'Carroll and Hiroshi Tezuka and Atsushi Hori + and Yutaka Ishikawa }, + booktitle = { Proceedings of the ICS }, + year = { 1998 } +} diff --git a/lustre/portals/doc/portals3.lyx b/lustre/portals/doc/portals3.lyx new file mode 100644 index 0000000..f3c24e0 --- /dev/null +++ b/lustre/portals/doc/portals3.lyx @@ -0,0 +1,15946 @@ +#LyX 1.2 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 220 +\textclass report +\begin_preamble +\usepackage{fullpage} +\renewenvironment{comment}% +{\begin{quote}\textbf{Discussion}: \slshape}% +{\end{quote}} +\pagestyle{myheadings} +\markboth{$Revision: 1.1.2.1 $\hfil$Date: 2003/05/19 04:25:30 $}% +{$Date: 2003/05/19 04:25:30 $\hfil$Revision: 1.1.2.1 $} +\end_preamble +\language american +\inputencoding auto +\fontscheme pslatex +\graphics default +\paperfontsize 10 +\spacing single +\papersize letterpaper +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 2 +\tocdepth 2 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 2 +\paperpagestyle headings + +\layout Title + +The Portals 3.2 Message Passing Interface +\newline + Revision 1.1 +\layout Author + +Ron Brightwell +\begin_inset Foot +collapsed true + +\layout Standard + +R. + Brightwell and R. + Riesen are with the Scalable Computing Systems Department, Sandia National + Laboratories, P.O. + Box 5800, Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov. +\end_inset + +, Arthur B. + Maccabe +\begin_inset Foot +collapsed true + +\layout Standard + +A. + B. + Maccabe is with the Computer Science Department, University of New Mexico, + Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87131-1386, maccabe@cs.unm.edu. +\end_inset + +, Rolf Riesen and Trammell Hudson +\layout Abstract + +This report presents a specification for the Portals 3.2 message passing + interface. + Portals 3.2 is intended to allow scalable, high-performance network communicatio +n between nodes of a parallel computing system. + Specifically, it is designed to support a parallel computing platform composed + of clusters of commodity workstations connected by a commodity system area + network fabric. 
+ In addition, Portals 3.2 is well suited to massively parallel processing + and embedded systems. + Portals 3.2 represents an adaption of the data movement layer developed + for massively parallel processing platforms, such as the 4500-node Intel + TeraFLOPS machine. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +clearpage +\backslash +pagenumbering{roman} +\backslash +setcounter{page}{3} +\end_inset + + +\layout Standard + + +\begin_inset LatexCommand \tableofcontents{} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList figure + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList table + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Chapter* + +Summary of Changes for Revision 1.1 +\layout Enumerate + +Updated version number to 3.2 throughout the document +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sub:PtlGetId} + +\end_inset + +: added +\family typewriter +PTL_SEGV +\family default + to error list for +\shape italic +PtlGetId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: added +\family typewriter +PTL_ML_TOOLONG +\family default + to error list for +\shape italic +PtlMEAttach +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meunlink} + +\end_inset + +: removed text referring to a list of associated memory descriptors. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added text to describe unlinking a free-floating memory descriptor. 
+\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added entry for +\family typewriter +ptl_seq_t +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added definition of +\family typewriter +max_offset +\family default +. +\layout Enumerate + +added text to clarify +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default +. +\end_deeper +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: modified text for +\family typewriter +unlink_op +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: added text to clarify multiple calls to +\shape italic +PtlNIInit +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: added text to clarify +\family typewriter +unlink_nofit +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:receiving} + +\end_inset + +: removed text indicating that an MD will reject a message if the associated + EQ is full. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + error code and text to indicate that only MDs with no pending operations + can be unlinked. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + return code. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added user id field, MD handle field, and NI specific failure field to + the +\family typewriter +ptl_event_t +\family default + structure. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added +\family typewriter +ptl_ni_fail_t +\family default +. 
+\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added +\family typewriter +PTL_EVENT_UNLINK +\family default + event type. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: removed +\shape slanted +PtlTransId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +: listed allowable constants with relevant fields. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: added +\shape italic +PtlMEAttachAny +\shape default + function. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_PT_FULL +\family default + return code for +\shape italic +PtlMEAttachAny +\shape default +. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + +: updated to reflect new event types. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: added +\family typewriter +ptl_nid_t +\family default +, +\family typewriter +ptl_pid_t +\family default +, and +\family typewriter +ptl_uid_t +\family default +. +\layout Chapter* + +Summary of Changes for Version 3.1 +\layout Section* + +Thread Issues +\layout Standard + +The most significant change to the interface from version 3.0 to 3.1 involves + the clarification of how the interface interacts with multi-threaded applicatio +ns. + We adopted a generic thread model in which processes define an address + space and threads share the address space. 
+ Consideration of the API in the light of threads led to several clarifications + throughout the document: +\layout Enumerate + +Glossary: +\begin_deeper +\layout Enumerate + +added a definition for +\emph on +thread +\emph default +, +\layout Enumerate + +reworded the definition for +\emph on +process +\emph default +. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +: added section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:threads} + +\end_inset + + to describe the multi-threading model used by the Portals API. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlinit} + +\end_inset + +: +\emph on +PtlInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlfini} + +\end_inset + +: +\emph on +PtlFini +\emph default + should be called once as the process is terminating and not as each thread + terminates. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +: Portals does not define thread ids. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + +: network interfaces are associated with processes, not threads. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: +\emph on +PtlNIInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqget} + +\end_inset + +: +\emph on +PtlEQGet +\emph default + returns +\family typewriter +PTL_EQ_EMPTY +\family default + if a thread is blocked on +\emph on +PtlEQWait +\emph default +.
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqwait} + +\end_inset + +: waiting threads are awakened in FIFO order. + +\layout Standard + +Two functions, +\emph on +PtlNIBarrier +\emph default + and +\emph on +PtlEQCount +\emph default + were removed from the API. + +\emph on +PtlNIBarrier +\emph default + was defined to block the calling process until all of the processes in + the application group had invoked +\emph on +PtlNIBarrier +\emph default +. + We now consider this functionality, along with the concept of groups (see + the discussion under +\begin_inset Quotes eld +\end_inset + +other changes +\begin_inset Quotes erd +\end_inset + +), to be part of the runtime system, not part of the Portals API. + +\emph on +PtlEQCount +\emph default + was defined to return the number of events in an event queue. + Because external operations may lead to new events being added and other + threads may remove events, the value returned by +\emph on +PtlEQCount +\emph default + would have to be a hint about the number of events in the event queue. +\layout Section* + +Handling small, unexpected messages +\layout Standard + +Another set of changes relates to handling small unexpected messages in + MPI. + In designing version 3.0, we assumed that each unexpected message would + be placed in a unique memory descriptor. + To avoid the need to process a long list of memory descriptors, we moved + the memory descriptors out of the match list and hung them off of a single + match list entry. + In this way, large unexpected messages would only encounter a single +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + match list entry before encountering the +\begin_inset Quotes eld +\end_inset + +long message +\begin_inset Quotes erd +\end_inset + + match list entry. + Experience with this strategy identified resource management problems with + this approach. 
+ In particular, a long sequence of very short (or zero length) messages + could quickly exhaust the memory descriptors constructed for handling unexpecte +d messages. + Our new strategy involves the use of several very large memory descriptors + for small unexpected messages. + Consecutive unexpected messages will be written into the first of these + memory descriptors until the memory descriptor fills up. + When the first of the +\begin_inset Quotes eld +\end_inset + +small memory +\begin_inset Quotes erd +\end_inset + + descriptors fills up, it will be unlinked and subsequent short messages + will be written into the next +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor. + In this case, a +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor will be declared full when it does not have sufficient + space for the largest small unexpected message. +\layout Standard + +This led to two significant changes. + First, each match list entry now has a single memory descriptor rather + than a list of memory descriptors. + Second, in addition to exceeding the operation threshold, a memory descriptor + can be unlinked when the local offset exceeds a specified value. + These changes have led to several changes in this document: +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{subsec:paddress} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed references to the memory descriptor list, +\layout Enumerate + +changed the portals address translation description to indicate that unlinking + a memory descriptor implies unlinking the associated match list entry--match + list entries can no longer be unlinked independently from the memory descriptor.
+ +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed unlink from argument list, +\layout Enumerate + +removed description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +changed wording of the error condition when the Portal table index already + has an associated match list. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +: removed unlink from argument list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: added +\family typewriter +max_offset +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +removed reference to memory descriptor lists, +\layout Enumerate + +changed wording of the error condition when match list entry already has + an associated memory descriptor, +\layout Enumerate + +changed the description of the +\family typewriter +unlink +\family default + argument. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +: removed +\family typewriter +PtlMDInsert +\family default + operation. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: removed references to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: removed references to PtlMDInsert. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + +: revised the MPI example to reflect the changes to the interface. + +\layout Standard + +Several changes have been made to improve the general documentation of the + interface. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_EQ_NONE +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_ID_ANY +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: documented the return value +\family typewriter +PTL_INV_EQ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + +: clarified the description of the +\emph on +PtlMDUpdate +\emph default + function. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:implvals} + +\end_inset + +: introduced a new section to document the implementation defined values. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: modified Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + to indicate where each constant is introduced and where it is used. + +\layout Section* + +Other changes +\layout Subsection* + +Implementation defined limits (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +The earlier version provided implementation defined limits for the maximum + number of match entries, the maximum number of memory descriptors, etc. 
+ Rather than spanning the entire implementation, these limits are now associated + with individual network interfaces. +\layout Subsection* + +Added User Ids (Section +\begin_inset LatexCommand \ref{sec:uid} + +\end_inset + +) +\layout Standard + +Group Ids had been used to simplify access control entries. + In particular, a process could allow access for all of the processes in + a group. + User Ids have been introduced to regain this functionality. + We use user ids to fill this role. +\layout Subsection* + +Removed Group Ids and Rank Ids (Section +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +) +\layout Standard + +The earlier version of Portals had two forms for addressing processes: <node id, process id> and <group id, rank id>. + A process group was defined as the collection of processes created during + application launch. + Each process in the group was given a unique rank id in the range 0 to + +\begin_inset Formula $n-1$ +\end_inset + + where +\begin_inset Formula $n$ +\end_inset + + was the number of processes in the group. + We removed groups because they are better handled in the runtime system. +\layout Subsection* + +Match lists (Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +) +\layout Standard + +It is no longer illegal to have an existing match entry when calling PtlMEAttach. + A position argument was added to the list of arguments supplied to +\emph on +PtlMEAttach +\emph default + to specify whether the new match entry is prepended or appended to the + existing list. + If there is no existing match list, the position argument is ignored. +\layout Subsection* + +Unlinking Memory Descriptors (Section +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +) +\layout Standard + +Previously, a memory descriptor could be unlinked if the offset exceeded + a threshold upon the completion of an operation. + In this version, the unlinking is delayed until there is a matching operation + which requires more memory than is currently available in the descriptor.
+ In addition to changes in that section, this led to a revision of Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + +. +\layout Subsection* + +Split Phase Operations and Events (Section +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + +) +\layout Standard + +Previously, there were five types of events: +\family typewriter +PTL_EVENT_PUT +\family default +, +\family typewriter +PTL_EVENT_GET +\family default +, +\family typewriter +PTL_EVENT_REPLY +\family default +, +\family typewriter +PTL_EVENT_SENT +\family default +, and +\family typewriter +PTL_EVENT_ACK. + +\family default +The first four of these reflected the completion of potentially long operations. + We have introduced new event types to reflect the fact that long operations + have a distinct starting point and a distinct completion point. + Moreover, the completion may be successful or unsuccessful. +\layout Standard + +In addition to providing a mechanism for reporting failure to higher levels + of software, this split provides an opportunity for improved ordering + semantics. + Previously, if one process initiated two operations (e.g., two put operations) + on a remote process, these operations were guaranteed to complete in the + same order that they were initiated. + Now, we only guarantee that the initiation events are delivered in the + same order. + In particular, the operations do not need to complete in the order that + they were initiated. +\layout Subsection* + +Well known process ids (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +To support the notion of +\begin_inset Quotes eld +\end_inset + +well known process ids, +\begin_inset Quotes erd +\end_inset + + we added a process id argument to the arguments for PtlNIInit. +\layout Chapter* + +Glossary +\layout Description + +API Application Programming Interface. + A definition of the functions and semantics provided by a library of functions.
+ +\layout Description + +Initiator A +\emph on +process +\emph default + that initiates a message operation. + +\layout Description + +Message An application-defined unit of data that is exchanged between +\emph on +processes +\emph default +. + +\layout Description + +Message\SpecialChar ~ +Operation Either a put operation, which writes data, or a get operation, + which reads data. + +\layout Description + +Network A network provides point-to-point communication between +\emph on +nodes +\emph default +. + Internally, a network may provide multiple routes between endpoints (to + improve fault tolerance or to improve performance characteristics); however, + multiple paths will not be exposed outside of the network. + +\layout Description + +Node A node is an endpoint in a +\emph on +network +\emph default +. + Nodes provide processing capabilities and memory. + A node may provide multiple processors (an SMP node) or it may act as a + +\emph on +gateway +\emph default + between networks. + +\layout Description + +Process A context of execution. + A process defines a virtual memory (VM) context. + This context is not shared with other processes. + Several threads may share the VM context defined by a process. + +\layout Description + +Target A +\emph on +process +\emph default + that is acted upon by a message operation. + +\layout Description + +Thread A context of execution that shares a VM context with other threads. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\layout Standard + +\backslash +setcounter{page}{1} +\backslash +pagenumbering{arabic} +\end_inset + + +\layout Chapter + +Introduction +\begin_inset LatexCommand \label{sec:intro} + +\end_inset + + +\layout Section + +Overview +\layout Standard + +This document describes an application programming interface for message + passing between nodes in a system area network. 
+ The goal of this interface is to improve the scalability and performance + of network communication by defining the functions and semantics of message + passing required for scaling a parallel computing system to ten thousand + nodes. + This goal is achieved by providing an interface that will allow a quality + implementation to take advantage of the inherently scalable design of Portals. +\layout Standard + +This document is divided into several sections: +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:intro} + +\end_inset + +---Introduction This section describes the purpose and scope of the Portals + API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +---An\SpecialChar ~ +Overview\SpecialChar ~ +of\SpecialChar ~ +the\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section gives a brief overview of the + Portals API. + The goal is to introduce the key concepts and terminology used in the descripti +on of the API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:api} + +\end_inset + +---The\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section describes the functions and semantics of + the Portals application programming interface. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +---The\SpecialChar ~ +Semantics\SpecialChar ~ +of\SpecialChar ~ +Message\SpecialChar ~ +Transmission This section describes the semantics + of message transmission. + In particular, the information transmitted in each type of message and + the processing of incoming messages. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:examples} + +\end_inset + +---Examples This section presents several examples intended to illustrate + the use of the Portals API.
+ +\layout Section + +Purpose +\layout Standard + +Existing message passing technologies available for commodity cluster networking + hardware do not meet the scalability goals required by the Cplant\SpecialChar ~ + +\begin_inset LatexCommand \cite{Cplant} + +\end_inset + + project at Sandia National Laboratories. + The goal of the Cplant project is to construct a commodity cluster that + can scale to the order of ten thousand nodes. + This number greatly exceeds the capacity for which existing message passing + technologies have been designed and implemented. +\layout Standard + +In addition to the scalability requirements of the network, these technologies + must also be able to support a scalable implementation of the Message Passing + Interface (MPI)\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPIstandard} + +\end_inset + + standard, which has become the +\shape italic +de facto +\shape default + standard for parallel scientific computing. + While MPI does not impose any scalability limitations, existing message + passing technologies do not provide the functionality needed to allow implement +ations of MPI to meet the scalability requirements of Cplant. +\layout Standard + +The following are properties of a network architecture that do not impose + any inherent scalability limitations: +\layout Itemize + +Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + + and TCP/IP sockets, have limitations on the number of peer connections + that can be established. + +\layout Itemize + +Network independence - Many communication systems depend on the host processor + to perform operations in order for messages in the network to be consumed. + Message consumption from the network should not be dependent on host processor + activity, such as the operating system scheduler or user-level thread scheduler. 
+ +\layout Itemize + +User-level flow control - Many communication systems manage flow control + internally to avoid depleting resources, which can significantly impact + performance as the number of communicating processes increases. + +\layout Itemize + +OS Bypass - High performance network communication should not involve memory + copies into or out of a kernel-managed protocol stack. + +\layout Standard + +The following are properties of a network architecture that do not impose + scalability limitations for an implementation of MPI: +\layout Itemize + +Receiver-managed - Sender-managed message passing implementations require + a persistent block of memory to be available for every process, requiring + memory resources to increase with job size and requiring user-level flow + control mechanisms to manage these resources. + +\layout Itemize + +User-level Bypass - While OS Bypass is necessary for high-performance, it + alone is not sufficient to support the Progress Rule of MPI asynchronous + operations. + +\layout Itemize + +Unexpected messages - Few communication systems have support for receiving + messages for which there is no prior notification. + Support for these types of messages is necessary to avoid flow control + and protocol overhead. + +\layout Section + +Background +\layout Standard + +Portals was originally designed for and implemented on the nCube machine + as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~ + +\begin_inset LatexCommand \cite{SUNMOS} + +\end_inset + + and Puma\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaOS} + +\end_inset + + lightweight kernel development projects. + Portals went through two design phases, the latter of which is used on + the 4500-node Intel TeraFLOPS machine\SpecialChar ~ + +\begin_inset LatexCommand \cite{TFLOPS} + +\end_inset + +. 
+ Portals have been very successful in meeting the needs of such a large + machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaMPI} + +\end_inset + +, but also for implementing the scalable run-time environment and parallel + I/O capabilities of the machine. +\layout Standard + +The second generation Portals implementation was designed to take full advantage + of the hardware architecture of large MPP machines. + However, efforts to implement this same design on commodity cluster technology + identified several limitations, due to the differences in network hardware + as well as to shortcomings in the design of Portals. +\layout Section + +Scalability +\layout Standard + +The primary goal in the design of Portals is scalability. + Portals are designed specifically for an implementation capable of supporting + a parallel job running on tens of thousands of nodes. + Performance is critical only in terms of scalability. + That is, the level of message passing performance is characterized by how + far it allows an application to scale and not by how it performs in micro-bench +marks (e.g., a two node bandwidth or latency test). +\layout Standard + +The Portals API is designed to allow for scalability, not to guarantee it. + Portals cannot overcome the shortcomings of a poorly designed application + program. + Applications that have inherent scalability limitations, either through + design or implementation, will not be transformed by Portals into scalable + applications. + Scalability must be addressed at all levels. + Portals do not inhibit scalability, but do not guarantee it either. +\layout Standard + +To support scalability, the Portals interface maintains a minimal amount + of state. + Portals provide reliable, ordered delivery of messages between pairs of + processes. 
+ They are connectionless: a process is not required to explicitly establish + a point-to-point connection with another process in order to communicate. + Moreover, all buffers used in the transmission of messages are maintained + in user space. + The target process determines how to respond to incoming messages, and + messages for which there are no buffers are discarded. +\layout Section + +Communication Model +\layout Standard + +Portals combine the characteristics of both one-sided and two-sided communication. + They define a +\begin_inset Quotes eld +\end_inset + +matching put +\begin_inset Quotes erd +\end_inset + + operation and a +\begin_inset Quotes eld +\end_inset + +matching get +\begin_inset Quotes erd +\end_inset + + operation. + The destination of a put (or send) is not an explicit address; instead, + each message contains a set of match bits that allow the receiver to determine + where incoming messages should be placed. + This flexibility allows Portals to support both traditional one-sided operation +s and two-sided send/receive operations. +\layout Standard + +Portals allows the target to determine whether incoming messages are acceptable. + A target process can choose to accept message operations from any specific + process or can choose to ignore message operations from any specific process. +\layout Section + +Zero Copy, OS Bypass and Application Bypass +\layout Standard + +In traditional system architectures, network packets arrive at the network + interface card (NIC), are passed through one or more protocol layers in + the operating system, and eventually copied into the address space of the + application. + As network bandwidth began to approach memory copy rates, reduction of + memory copies became a critical concern. + This concern led to the development of zero-copy message passing protocols + in which message copies are eliminated or pipelined to avoid the loss of + bandwidth.
+\layout Standard + +A typical zero-copy protocol has the NIC generate an interrupt for the CPU + when a message arrives from the network. + The interrupt handler then controls the transfer of the incoming message + into the address space of the appropriate application. + The interrupt latency, the time from the initiation of an interrupt until + the interrupt handler is running, is fairly significant. + To avoid this cost, some modern NICs have processors that can be programmed + to implement part of a message passing protocol. + Given a properly designed protocol, it is possible to program the NIC to + control the transfer of incoming messages, without needing to interrupt + the CPU. + Because this strategy does not need to involve the OS on every message + transfer, it is frequently called +\begin_inset Quotes eld +\end_inset + +OS Bypass. +\begin_inset Quotes erd +\end_inset + + ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + +, FM\SpecialChar ~ + +\begin_inset LatexCommand \cite{FM2} + +\end_inset + +, GM\SpecialChar ~ + +\begin_inset LatexCommand \cite{GM} + +\end_inset + +, and Portals are examples of OS Bypass protocols. +\layout Standard + +Many protocols that support OS Bypass still require that the application + actively participate in the protocol to ensure progress. + As an example, the long message protocol of PM requires that the application + receive and reply to a request to put or get a long message. + This complicates the runtime environment, requiring a thread to process + incoming requests, and significantly increases the latency required to + initiate a long message protocol. + The Portals message passing protocol does not require activity on the part + of the application to ensure progress. 
+ We use the term +\begin_inset Quotes eld +\end_inset + +Application Bypass +\begin_inset Quotes erd +\end_inset + + to refer to this aspect of the Portals protocol. +\layout Section + +Faults +\layout Standard + +Given the number of components that we are dealing with and the fact that + we are interested in supporting applications that run for very long times, + failures are inevitable. + The Portals API recognizes that the underlying transport may not be able + to successfully complete an operation once it has been initiated. + This is reflected in the fact that the Portals API reports three types + of events: events indicating the initiation of an operation, events indicating + the successful completion of an operation, and events indicating the unsuccessf +ul completion of an operation. + Every initiation event is eventually followed by a successful completion + event or an unsuccessful completion event. +\layout Standard + +Between the time an operation is started and the time that the operation + completes (successfully or unsuccessfully), any memory associated with + the operation should be considered volatile. + That is, the memory may be changed in unpredictable ways while the operation + is progressing. + Once the operation completes, the memory associated with the operation + will not be subject to further modification (from this operation). + Notice that unsuccessful operations may alter memory in an essentially + unpredictable fashion. +\layout Chapter + +An Overview of the Portals API +\begin_inset LatexCommand \label{sec:apiover} + +\end_inset + + +\layout Standard + +In this section, we give a conceptual overview of the Portals API. + The goal is to provide a context for understanding the detailed description + of the API presented in the next section. +\layout Section + +Data Movement +\begin_inset LatexCommand \label{sec:dmsemantics} + +\end_inset + + +\layout Standard + +A Portal represents an opening in the address space of a process. 
+ Other processes can use a Portal to read (get) or write (put) the memory + associated with the portal. + Every data movement operation involves two processes, the +\series bold +initiator +\series default + and the +\series bold +target +\series default +. + The initiator is the process that initiates the data movement operation. + The target is the process that responds to the operation by either accepting + the data for a put operation, or replying with the data for a get operation. +\layout Standard + +In this discussion, activities attributed to a process may refer to activities + that are actually performed by the process or +\emph on +on behalf of the process +\emph default +. + The inclusiveness of our terminology is important in the context of +\emph on +application bypass +\emph default +. + In particular, when we note that the target sends a reply in the case of + a get operation, it is possible that the reply will be generated by another + component in the system, bypassing the application. +\layout Standard + +Figures\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:put} + +\end_inset + + and +\begin_inset LatexCommand \ref{fig:get} + +\end_inset + + present graphical interpretations of the Portal data movement operations: + put and get. + In the case of a put operation, the initiator sends a put request message + containing the data to the target. + The target translates the Portal addressing information in the request + using its local Portal structures. + When the request has been processed, the target optionally sends an acknowledge +ment message.
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename put.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Put (Send) +\begin_inset LatexCommand \label{fig:put} + +\end_inset + + +\end_inset + + +\layout Standard + +In the case of a get operation, the initiator sends a get request to the + target. + As with the put operation, the target translates the Portal addressing + information in the request using its local Portal structures. + Once it has translated the Portal addressing information, the target sends + a reply that includes the requested data. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename get.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Get +\begin_inset LatexCommand \label{fig:get} + +\end_inset + + +\end_inset + + +\layout Standard + +We should note that Portal address translations are only performed on nodes + that respond to operations initiated by other nodes. + Acknowledgements and replies to get operations bypass the portals address + translation structures. +\layout Section + +Portal Addressing +\begin_inset LatexCommand \label{subsec:paddress} + +\end_inset + + +\layout Standard + +One-sided data movement models (e.g., shmem\SpecialChar ~ + +\begin_inset LatexCommand \cite{CraySHMEM} + +\end_inset + +, ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, MPI-2\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPI2} + +\end_inset + +) typically use a triple to address memory on a remote node. + This triple consists of a process id, memory buffer id, and offset. 
+ The process id identifies the target process, the memory buffer id specifies + the region of memory to be used for the operation, and the offset specifies + an offset within the memory buffer. +\layout Standard + +In addition to the standard address components (process id, memory buffer + id, and offset), a Portal address includes a set of match bits. + This addressing model is appropriate for supporting one-sided operations + as well as traditional two-sided message passing operations. + Specifically, the Portals API provides the flexibility needed for an efficient + implementation of MPI-1, which defines two-sided operations with one-sided + completion semantics. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:portals} + +\end_inset + + presents a graphical representation of the structures used by a target + in the interpretation of a Portal address. + The process id is used to route the message to the appropriate node and + is not reflected in this diagram. + The memory buffer id, called the +\series bold +portal id +\series default +, is used as an index into the Portal table. + Each element of the Portal table identifies a match list. + Each element of the match list specifies two bit patterns: a set of +\begin_inset Quotes eld +\end_inset + +don't care +\begin_inset Quotes erd +\end_inset + + bits, and a set of +\begin_inset Quotes eld +\end_inset + +must match +\begin_inset Quotes erd +\end_inset + + bits. + In addition to the two sets of match bits, each match list element has + at most one memory descriptor. + Each memory descriptor identifies a memory region and an optional event + queue. + The memory region specifies the memory to be used in the operation and + the event queue is used to record information about these operations. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename portals.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 305pt + lyxheight 106pt +\end_inset + + +\layout Caption + +Portal Addressing Structures +\begin_inset LatexCommand \label{fig:portals} + +\end_inset + + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + + illustrates the steps involved in translating a Portal address, starting + from the first element in a match list. + If the match criteria specified in the match list entry are met and the + memory descriptor list accepts the operation +\begin_inset Foot +collapsed true + +\layout Standard + +Memory descriptors can reject operations because a threshold has been exceeded + or because the memory region does not have sufficient space, see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + +, the operation (put or get) is performed using the memory region specified + in the memory descriptor. + If the memory descriptor specifies that it is to be unlinked when a threshold + has been exceeded, the match list entry is removed from the match list + and the resources associated with the memory descriptor and match list + entry are reclaimed. + Finally, if there is an event queue specified in the memory descriptor, + the operation is logged in the event queue. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename flow_new.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 447pt + lyxheight 282pt +\end_inset + + +\layout Caption + +Portals Address Translation +\begin_inset LatexCommand \label{fig:flow} + +\end_inset + + +\end_inset + + +\layout Standard + +If the match criteria specified in the match list entry are not met, or + there is no memory descriptor associated with the match list entry, or + the memory descriptor associated with the match list entry rejects the + operation, the address translation continues with the next match list entry. + If the end of the match list has been reached, the address translation + is aborted and the incoming request is discarded. +\layout Section + +Access Control +\layout Standard + +A process can control access to its portals using an access control list. + Each entry in the access control list specifies a process id and a Portal + table index. + The access control list is actually an array of entries. + Each incoming request includes an index into the access control list (i.e., + a +\begin_inset Quotes eld +\end_inset + +cookie +\begin_inset Quotes erd +\end_inset + + or hint). + If the id of the process issuing the request doesn't match the id specified + in the access control list entry or the Portal table index specified in + the request doesn't match the Portal table index specified in the access + control list entry, the request is rejected. + Process identifiers and Portal table indexes may include wild card values + to increase the flexibility of this mechanism. + +\layout Standard + +Two aspects of this design merit further discussion. + First, the model assumes that the information in a message header, the + sender's id in particular, is trustworthy. 
+ In most contexts, we assume that the entity that constructs the header + is trustworthy; however, using cryptographic techniques, we could easily + devise a protocol that would ensure the authenticity of the sender. +\layout Standard + +Second, because the access check is performed by the receiver, it is possible + that a malicious process will generate thousands of messages that will + be denied by the receiver. + This could saturate the network and/or the receiver, resulting in a +\emph on +denial of service +\emph default + attack. + Moving the check to the sender using capabilities, would remove the potential + for this form of attack. + However, the solution introduces the complexities of capability management + (exchange of capabilities, revocation, protections, etc). +\layout Section + +Multi-threaded Applications +\begin_inset LatexCommand \label{sec:threads} + +\end_inset + + +\layout Standard + +The Portals API supports a generic view of multi-threaded applications. + From the perspective of the Portals API, an application program is defined + by a set of processes. + Each process defines a unique address space. + The Portals API defines access to this address space from other processes + (using portals addressing and the data movement operations). + A process may have one or more +\emph on +threads +\emph default + executing in its address space. + +\layout Standard + +With the exception of +\emph on +PtlEQWait +\emph default + every function in the Portals API is non-blocking and atomic with respect + to both other threads and external operations that result from data movement + operations. + While individual operations are atomic, sequences of these operations may + be interleaved between different threads and with external operations. + The Portals API does not provide any mechanisms to control this interleaving. + It is expected that these mechanisms will be provided by the API used to + create threads. 
+\layout Chapter + +The Portals API +\begin_inset LatexCommand \label{sec:api} + +\end_inset + + +\layout Section + +Naming Conventions +\begin_inset LatexCommand \label{sec:conv} + +\end_inset + + +\layout Standard + +The Portals API defines two types of entities: functions and types. + Functions always start with +\emph on +Ptl +\emph default + and use mixed upper and lower case. + When used in the body of this report, function names appear in italic face, + e.g., +\emph on +PtlInit +\emph default +. + The functions associated with an object type will have names that start + with +\emph on +Ptl +\emph default +, followed by the two letter object type code shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + As an example, the function +\emph on +PtlEQAlloc +\emph default + allocates resources for an event queue. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Object Type Codes +\begin_inset LatexCommand \label{tab:objcodes} + +\end_inset + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\newline + +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\emph on +xx +\end_inset + + +\begin_inset Text + +\layout Standard + + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +EQ +\end_inset + + +\begin_inset Text + +\layout Standard + + Event Queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + MD +\end_inset + + +\begin_inset Text + +\layout Standard + + Memory Descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset 
Text + +\layout Standard + + ME +\end_inset + + +\begin_inset Text + +\layout Standard + + Match list Entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + NI +\end_inset + + +\begin_inset Text + +\layout Standard + + Network Interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Type names use lower case with underscores to separate words. + Each type name starts with +\family typewriter +ptl +\family default +_ and ends with +\family typewriter +_t +\family default +. + When used in the body of this report, type names appear in a fixed font, + e.g., +\family typewriter +ptl_match_bits_t +\family default +. +\layout Standard + +Names for constants use upper case with underscores to separate words. + Each constant name starts with +\family typewriter +PTL_ +\family default +. + When used in the body of this report, constant names appear in a fixed font, + e.g., +\family typewriter +PTL_OK +\family default +. +\layout Section + +Base Types +\layout Standard + +The Portals API defines a variety of base types. + These types represent a simple renaming of the base types provided by the + C programming language. + In most cases these new type names have been introduced to improve type + safety and to avoid issues arising from differences in representation sizes + (e.g., 16-bit or 32-bit integers). +\layout Subsection + +Sizes +\begin_inset LatexCommand \label{sec:size-t} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_size_t +\family default + is an unsigned 64-bit integral type used for representing sizes. 
+\layout Subsection + +Handles +\begin_inset LatexCommand \label{sec:handle-type} + +\end_inset + + +\layout Standard + +Objects maintained by the API are accessed through handles. + Handle types have names of the form +\family typewriter +ptl_handle_ +\emph on +xx +\emph default +_t +\family default +, where +\emph on +xx +\emph default + is one of the two letter object type codes shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + For example, the type +\family typewriter +ptl_handle_ni_t +\family default + is used for network interface handles. +\layout Standard + +Each type of object is given a unique handle type to enhance type checking. + The type, +\family typewriter +ptl_handle_any_t +\family default +, can be used when a generic handle is needed. + Every handle value can be converted into a value of type +\family typewriter +ptl_handle_any_t +\family default + without loss of information. +\layout Standard + +Handles are not simple values. + Every portals object is associated with a specific network interface and + an identifier for this interface (along with an object identifier) is part + of the handle for the object. +\layout Standard + +The special value +\family typewriter +PTL_EQ_NONE +\family default +, of type +\family typewriter +ptl_handle_eq_t +\family default +, is used to indicate the absence of an event queue. + See sections +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + for uses of this value. +\layout Subsection + +Indexes +\begin_inset LatexCommand \label{sec:index-type} + +\end_inset + + +\layout Standard + +The types +\family typewriter +ptl_pt_index_t +\family default + and +\family typewriter +ptl_ac_index_t +\family default + are integral types used for representing Portal table indexes and access + control tables indexes, respectively. 
+ See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + + for limits on values of these types. +\layout Subsection + +Match Bits +\begin_inset LatexCommand \label{sec:mb-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_match_bits_t +\family default + is capable of holding unsigned 64-bit integer values. +\layout Subsection + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_interface_t +\family default + is an integral type used for identifying different network interfaces. + Users will need to consult the local documentation to determine appropriate + values for the interfaces available. + The special value +\family typewriter +PTL_IFACE_DEFAULT +\family default + identifies the default interface. +\layout Subsection + +Identifiers +\begin_inset LatexCommand \label{sec:id-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_nid_t +\family default + is an integral type used for representing node ids +\family typewriter +, ptl_pid_t +\family default + is an integral type for representing process ids, and +\family typewriter +ptl_uid_t +\family default + is an integral type for representing user ids. +\layout Standard + +The special value +\family typewriter +PTL_PID_ANY +\family default + matches any process identifier, PTL_NID_ANY matches any node identifier, + and +\family typewriter +PTL_UID_ANY +\family default + matches any user identifier. + See sections +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + for uses of these values. 
+\layout Subsection + +Status Registers +\begin_inset LatexCommand \label{sec:stat-type} + +\end_inset + + +\layout Standard + +Each network interface maintains an array of status registers that can be + accessed using the +\family typewriter +PtlNIStatus +\family default + function (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + The type +\family typewriter +ptl_sr_index_t +\family default + defines the types of indexes that can be used to access the status registers. + The only index defined for all implementations is +\family typewriter +PTL_SR_DROP_COUNT +\family default + which identifies the status register that counts the dropped requests for + the interface. + Other indexes (and registers) may be defined by the implementation. +\layout Standard + +The type +\family typewriter +ptl_sr_value_t +\family default + defines the types of values held in status registers. + This is a signed integer type. + The size is implementation dependent, but must be at least 32 bits. +\layout Section + +Initialization and Cleanup +\begin_inset LatexCommand \label{sec:init} + +\end_inset + + +\layout Standard + +The Portals API includes a function, +\emph on +PtlInit +\emph default +, to initialize the library and a function, +\emph on +PtlFini +\emph default +, to cleanup after the application is done using the library. +\layout Subsection + +PtlInit +\begin_inset LatexCommand \label{sec:ptlinit} + +\end_inset + + +\layout LyX-Code + +int PtlInit( int *max_interfaces ); +\layout Standard +\noindent +The +\emph on +PtlInit +\emph default + function initializes the Portals library. + PtlInit must be called at least once by a process before any thread makes + a Portals function call, but may be safely called more than once. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_FAIL Indicates an error during initialization. 
+ +\layout Description + +PTL_SEGV Indicates that +\family typewriter +max_interfaces +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +max_interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the maximum number of interfaces + that can be initialized. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlFini +\begin_inset LatexCommand \label{sec:ptlfini} + +\end_inset + + +\layout LyX-Code + +void PtlFini( void ); +\layout Standard +\noindent +The +\emph on +PtlFini +\emph default + function cleans up after the Portals library is no longer needed by a process. + After this function is called, calls to any of the functions defined by + the Portal API or use of the structures set up by the Portals API will + result in undefined behavior. + This function should be called once and only once during termination by + a process. + Typically, this function will be called in the exit sequence of a process. + Individual threads should not call PtlFini when they terminate. +\layout Section + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni} + +\end_inset + + +\layout Standard + +The Portals API supports the use of multiple network interfaces. + However, each interface is treated as an independent entity. + Combining interfaces (e.g., +\begin_inset Quotes eld +\end_inset + +bonding +\begin_inset Quotes erd +\end_inset + + to create a higher bandwidth connection) must be implemented by the application + or embedded in the underlying network. + Interfaces are treated as independent entities to make it easier to cache + information on individual network interface cards. 
+\layout Standard + +Once initialized, each interface provides a Portal table, an access control + table, and a collection of status registers. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for a discussion of updating Portal table entries using the +\emph on +PtlMEAttach +\emph default + function. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + for a discussion of the initialization and updating of entries in the access + control table. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + for a discussion of the +\emph on +PtlNIStatus +\emph default + function which can be used to determine the value of a status register. +\layout Standard + +Every other type of Portal object (e.g., memory descriptor, event queue, or + match list entry) is associated with a specific network interface. + The association to a network interface is established when the object is + created and is encoded in the handle for the object. +\layout Standard + +Each network interface is initialized and shutdown independently. + The initialization routine, +\emph on +PtlNIInit +\emph default +, returns a handle for an interface object which is used in all subsequent + Portal operations. + The +\emph on +PtlNIFini +\emph default + function is used to shutdown an interface and release any resources that + are associated with the interface. + Network interface handles are associated with processes, not threads. + All threads in a process share all of the network interface handles. 
+\layout Standard + +The Portals API also defines the +\emph on +PtlNIStatus +\emph default + function to query the status registers for a network interface, the +\emph on +PtlNIDist +\emph default + function to determine the +\begin_inset Quotes eld +\end_inset + +distance +\begin_inset Quotes erd +\end_inset + + to another process, and the +\emph on +PtlNIHandle +\emph default + function to determine the network interface that an object is associated + with. +\layout Subsection + +PtlNIInit +\begin_inset LatexCommand \label{sec:niinit} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + int max_match_entries; +\newline + int max_mem_descriptors; +\newline + int max_event_queues; +\newline + ptl_ac_index_t max_atable_index; +\newline + ptl_pt_index_t max_ptable_index; +\newline +} ptl_ni_limits_t; +\newline + +\newline +int PtlNIInit( ptl_interface_t interface, +\newline + ptl_pid_t pid, +\newline + ptl_ni_limits_t* desired, +\newline + ptl_ni_limits_t* actual, +\newline + ptl_handle_ni_t* handle ); +\layout Standard + +Values of type +\family typewriter +ptl_ni_limits_t +\family default + include the following members: +\layout Description + +max_match_entries Maximum number of match entries that can be allocated + at any one time. +\layout Description + +max_mem_descriptors Maximum number of memory descriptors that can be allocated + at any one time. +\layout Description + +max_event_queues Maximum number of event queues that can be allocated at + any one time. +\layout Description + +max_atable_index Largest access control table index for this interface, + valid indexes range from zero to +\family typewriter +max_atable_index +\family default +, inclusive. +\layout Description + +max_ptable_index Largest Portal table index for this interface, valid indexes + range from zero to +\family typewriter +max_ptable_index +\family default +, inclusive. 
+\layout Standard +\noindent +The +\emph on +PtlNIInit +\emph default + function is used to initialize the Portals API for a network interface. + This function must be called at least once by each process before any other + operations that apply to the interface by any process or thread. + For subsequent calls to +\shape italic +PtlNIInit +\shape default + from within the same process (either by different threads or the same thread), + the desired limits will be ignored and the call will return the existing + NI handle. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INIT_DUP Indicates a duplicate initialization of +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INIT_INV Indicates that +\family typewriter +interface +\family default + is not a valid network interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to initialize the + interface. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +pid +\family default + is not a valid process id. +\layout Description + +PTL_SEGV Indicates that +\family typewriter +actual +\family default + or +\family typewriter + handle +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the network interface to be initialized. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + for a discussion of values used to identify network interfaces.) 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +pid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the desired process id (for well known process ids). + The value +\family typewriter +PTL_PID_ANY +\family default + may be used to have the process id assigned by the underlying library. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +desired +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If non-NULL, points to a structure that holds the desired limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +actual +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, the location pointed to by actual will hold the actual + limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the interface. +\end_inset + + + + +\end_inset + + +\layout Comment + +The use of desired is implementation dependent. + In particular, an implementation may choose to ignore this argument. +\layout Subsection + +PtlNIFini +\begin_inset LatexCommand \label{sec:nifini} + +\end_inset + + +\layout LyX-Code + +int PtlNIFini( ptl_handle_ni_t interface ); +\layout Standard +\noindent +The +\emph on +PtlNIFini +\emph default + function is used to release the resources allocated for a network interface. 
+ Once the +\emph on +PtlNIFini +\emph default + operation has been started, the results of pending API operations (e.g., + operations initiated by another thread) for this interface are undefined. + Similarly, the effects of incoming operations (puts and gets) or return + values (acknowledgements and replies) for this interface are undefined. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the interface to shutdown. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlNIStatus +\begin_inset LatexCommand \label{sec:nistatus} + +\end_inset + + +\layout LyX-Code + +int PtlNIStatus( ptl_handle_ni_t interface, +\newline + ptl_sr_index_t status_register, +\newline + ptl_sr_value_t* status ); +\layout Standard +\noindent +The +\emph on +PtlNIStatus +\emph default + function returns the value of a status register for the specified interface. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + for more information on status register indexes and status register values.) +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. 
+ +\layout Description + +PTL_INV_SR_INDX Indicates that +\family typewriter +status_register +\family default + is not a valid status register. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +status +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status_register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An index for the status register to read. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the current value of the status + register. +\end_inset + + + + +\end_inset + + +\layout Comment + +The only status register that must be defined is a drop count register ( +\family typewriter +PTL_SR_DROP_COUNT +\family default +). + Implementations may define additional status registers. + Identifiers for the indexes associated with these registers should start + with the prefix +\family typewriter +PTL_SR_ +\family default +. +\layout Subsection + +PtlNIDist +\layout LyX-Code + +int PtlNIDist( ptl_handle_ni_t interface, +\newline + ptl_process_id_t process, +\newline + unsigned long* distance ); +\layout Standard +\noindent +The +\emph on +PtlNIDist +\emph default + function returns the distance to another process using the specified interface. 
+ Distances are only defined relative to an interface. + Distance comparisons between different interfaces on the same process may + be meaningless. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +process +\family default + is not a valid process identifier. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +distance +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An identifier for the process whose distance is being requested. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +distance +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the distance to the remote + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +This function should return a static measure of distance. + Examples include minimum latency, the inverse of available bandwidth, or + the number of switches between the two endpoints. 
+\layout Subsection + +PtlNIHandle +\layout LyX-Code + +int PtlNIHandle( ptl_handle_any_t handle, +\newline + ptl_handle_ni_t* interface ); +\layout Standard +\noindent +The +\emph on +PtlNIHandle +\emph default + function returns a handle for the network interface with which the object + identified by +\family typewriter +handle +\family default + is associated. + If the object identified by +\family typewriter +handle +\family default + is a network interface, this function returns the same value it is passed. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_HANDLE Indicates that +\family typewriter +handle +\family default + is not a valid handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +interface +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the object. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the network interface + associated with +\family typewriter +handle +\family default +. +\end_inset + + + + +\end_inset + + +\layout Comment + +Every handle should encode the network interface and the object id relative + to this handle. + Both are presumably encoded using integer values. 
+\layout Section + +User Identification +\begin_inset LatexCommand \label{sec:uid} + +\end_inset + + +\layout Standard + +Every process runs on behalf of a user. + +\layout Subsection + +PtlGetUid +\layout LyX-Code + +int PtlGetUid( ptl_handle_ni_t ni_handle, +\newline + ptl_uid_t* uid ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +uid +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +uid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the user id for the calling + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that user identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, a process may have multiple + user identifiers. +\layout Section + +Process Identification +\begin_inset LatexCommand \label{sec:pid} + +\end_inset + + +\layout Standard + +Processes that use the Portals API can be identified using a node id and + process id.
+ Every node accessible through a network interface has a unique node identifier + and every process running on a node has a unique process identifier. + As such, any process in the computing system can be identified by its node + id and process id. + +\layout Standard + +The Portals API defines a type, +\family typewriter +ptl_process_id_t +\family default + for representing process ids and a function, +\emph on +PtlGetId +\emph default +, which can be used to obtain the id of the current process. +\layout Comment + +The portals API does not include thread identifiers. + Messages are delivered to processes (address spaces) not threads (contexts + of execution). +\layout Subsection + +The Process Id Type +\begin_inset LatexCommand \label{sec:pid-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_nid_t nid; /* node id */ +\newline + ptl_pid_t pid; /* process id */ +\newline +} ptl_process_id_t; +\layout Standard +\noindent +The +\family typewriter +ptl_process_id_t +\family default + type uses two identifiers to represent a process id: a node id and a process + id. + +\layout Subsection + +PtlGetId +\begin_inset LatexCommand \label{sub:PtlGetId} + +\end_inset + + +\layout LyX-Code + +int PtlGetId( ptl_handle_ni_t ni_handle, +\newline + ptl_process_id_t* id ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +id +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the id for the calling process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that process identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, it may have multiple + node identifiers. +\layout Section + +Match List Entries and Match Lists +\begin_inset LatexCommand \label{sec:me} + +\end_inset + + +\layout Standard + +A match list is a chain of match list entries. + Each match list entry includes a memory descriptor and a set of match criteria. + The match criteria can be used to reject incoming requests based on process + id or the match bits provided in the request. + A match list is created using the +\emph on +PtlMEAttach +\emph default + or +\shape italic +PtlMEAttachAny +\shape default + functions, which create a match list consisting of a single match list + entry, attach the match list to the specified Portal index, and return + a handle for the match list entry. + Match entries can be dynamically inserted and removed from a match list + using the +\emph on +PtlMEInsert +\emph default + and +\emph on +PtlMEUnlink +\emph default + functions.
+\layout Subsection + +PtlMEAttach +\begin_inset LatexCommand \label{sec:meattach} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t; +\newline + +\layout LyX-Code + +typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t; +\newline + +\layout LyX-Code + +int PtlMEAttach( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ins_pos_t +\family default + are used to control where a new item is inserted. + The value +\family typewriter +PTL_INS_BEFORE +\family default + is used to insert the new item before the current item or before the head + of the list. + The value +\family typewriter +PTL_INS_AFTER +\family default + is used to insert the new item after the current item or after the last + item in the list. + +\layout Standard + +The +\emph on +PtlMEAttach +\emph default + function creates a match list consisting of a single entry and attaches + this list to the Portal table for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PTINDEX Indicates that +\family typewriter +index +\family default + is not a valid Portal table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. 
+ +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The Portal table index where the match list should be attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specifies the match criteria for the process id of the requestor. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +match_bits, ignorebits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specify the match criteria to apply to the match bits in the incoming request. + The +\family typewriter +ignorebits +\family default + are used to mask out insignificant bits in the incoming match bits. 
+ The resulting bits are then compared to the match list entry's match + bits to determine if the incoming request meets the match criteria. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates the match list entry should be unlinked when the last memory descripto +r associated with this match list entry is unlinked. + (Note, the check for unlinking a match entry only occurs when a memory + descriptor is unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be prepended or appended to + the existing match list. + If there is no existing list, this argument is ignored and the new match + entry becomes the only entry in the list. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. 
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEAttachAny +\begin_inset LatexCommand \label{sec:attachany} + +\end_inset + + +\layout LyX-Code + +int PtlMEAttachAny( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t *index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEAttachAny +\emph default + function creates a match list consisting of a single entry and attaches + this list to an unused Portal table entry for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_PT_FULL Indicates that there are no free entries in the Portal table. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the Portal index where the + match list has been attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid, match_bits, ignorebits, unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\shape italic +PtlMEAttach +\shape default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEInsert +\begin_inset LatexCommand \label{sec:meinsert} + +\end_inset + + +\layout LyX-Code + +int PtlMEInsert( ptl_handle_me_t current, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEInsert +\emph default + function creates a new match list entry and inserts this entry into the + match list containing +\family typewriter +current +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier.
+ +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +current +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match entry. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +current +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for a match entry. + The new match entry will be inserted immediately before or immediately + after this match entry. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\family default +, +\family typewriter +match_bits +\family default +, +\family typewriter +ignorebits +\family default +, +\family typewriter +unlink +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be inserted before or after + the +\family typewriter +current +\family default + entry. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default +. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEUnlink +\begin_inset LatexCommand \label{sec:meunlink} + +\end_inset + + +\layout LyX-Code + +int PtlMEUnlink( ptl_handle_me_t entry ); +\layout Standard +\noindent +The +\emph on +PtlMEUnlink +\emph default + function can be used to unlink a match entry from a match list. + This operation also releases any resources associated with the match entry + (including the associated memory descriptor). + It is an error to use the match entry handle after calling +\emph on +PtlMEUnlink +\emph default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +entry +\family default + is not a valid match entry handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the match entry to be unlinked. +\end_inset + + + + +\end_inset + + +\layout Section + +Memory Descriptors +\begin_inset LatexCommand \label{sec:md} + +\end_inset + + +\layout Standard + +A memory descriptor contains information about a region of an application + process' memory and an event queue where information about the operations + performed on the memory descriptor is recorded.
+ The Portals API provides two operations to create memory descriptors: +\emph on +PtlMDAttach +\emph default +, and +\emph on +PtlMDBind +\emph default +; an operation to update a memory descriptor, +\emph on +PtlMDUpdate +\emph default +; and an operation to unlink and release the resources associated with a + memory descriptor, +\emph on +PtlMDUnlink +\emph default +. +\layout Subsection + +The Memory Descriptor Type +\begin_inset LatexCommand \label{sec:md-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + void* start; +\newline + ptl_size_t length; +\newline + int threshold; +\newline + unsigned int max_offset; +\newline + unsigned int options; +\newline + void* user_ptr; +\newline + ptl_handle_eq_t eventq; +\newline +} ptl_md_t; +\layout Standard +\noindent +The +\family typewriter +ptl_md_t +\family default + type defines the application view of a memory descriptor. + Values of this type are used to initialize and update the memory descriptors. +\layout Subsubsection + +Members +\layout Description + +start,\SpecialChar ~ +length Specify the memory region associated with the memory descriptor. + The +\family typewriter +start +\family default + member specifies the starting address for the memory region and the +\family typewriter +length +\family default + member specifies the length of the region. + The +\family typewriter +start member +\family default + can be NULL provided that the +\family typewriter +length +\family default + member is zero. + (Zero length buffers are useful to record events.) There are no alignment + restrictions on the starting address or the length of the region; although, + unaligned messages may be slower (i.e., lower bandwidth and/or longer latency) + on some implementations. + +\layout Description + +threshold Specifies the maximum number of operations that can be performed + on the memory descriptor. 
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + for the different types of events). + In the usual case, the threshold value is decremented for each operation + on the memory descriptor. + When the threshold value is zero, the memory descriptor is +\emph on +inactive +\emph default +, and does not respond to operations. + A memory descriptor can have an initial threshold value of zero to allow + for manipulation of an inactive memory descriptor by the local process. + A threshold value of +\family typewriter +PTL_MD_THRESH_INF +\family default + indicates that there is no bound on the number of operations that may be + applied to a memory descriptor. + Note that local operations (e.g., +\emph on +PtlMDUpdate +\emph default +) are not applied to the threshold count. + +\layout Description + +max_offset Specifies the maximum local offset of a memory descriptor. + When the local offset of a memory descriptor exceeds this maximum, the + memory descriptor becomes +\shape italic +inactive +\shape default + and does not respond to further operations. +\layout Description + +options Specifies the behavior of the memory descriptor. + There are five options that can be selected: enable put operations (yes + or no), enable get operations (yes or no), offset management (local or + remote), message truncation (yes or no), and acknowledgement (yes or no). + Values for this argument can be constructed using a bitwise or of the following + values: +\begin_deeper +\begin_deeper +\layout Description + +PTL_MD_OP_PUT Specifies that the memory descriptor will respond to +\emph on +put +\emph default + operations. + By default, memory descriptors reject +\emph on +put +\emph default + operations. + +\layout Description + +PTL_MD_OP_GET Specifies that the memory descriptor will respond to +\emph on +get +\emph default + operations. 
+ By default, memory descriptors reject +\emph on +get +\emph default + operations. + +\layout Description + +PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory + region is provided by the incoming request. + By default, the offset is maintained locally. + When the offset is maintained locally, the offset is incremented by the + length of the request so that the next operation (put and/or get) will + access the next part of the memory region. +\layout Description + +PTL_MD_TRUNCATE Specifies that the length provided in the incoming request + can be reduced to match the memory available in the region. + (The memory available in a memory region is determined by subtracting the + offset from the length of the memory region.) By default, if the length + in the incoming operation is greater than the amount of memory available, + the operation is rejected. + +\layout Description + +PTL_MD_ACK_DISABLE Specifies that an acknowledgement should +\emph on +not +\emph default + be sent for incoming +\emph on +put +\emph default + operations, even if requested. + By default, acknowledgements are sent for +\emph on +put +\emph default + operations that request an acknowledgement. + Acknowledgements are never sent for +\emph on +get +\emph default + operations. + The value sent in the reply serves as an implicit acknowledgement. + +\end_deeper +\layout Standard + + +\series bold +Note +\series default +: It is not considered an error to have a memory descriptor that does not + respond to either +\emph on +put +\emph default + or +\emph on +get +\emph default + operations: Every memory descriptor responds to +\emph on +reply +\emph default + operations. + Nor is it considered an error to have a memory descriptor that responds + to both +\emph on +put +\emph default + and +\emph on +get +\emph default + operations. + +\end_deeper +\layout Description + +user_ptr A user-specified value that is associated with the memory descriptor. 
+ The value does not need to be a pointer, but must fit in the space used + by a pointer. + This value (along with other values) is recorded in events associated with + operations on this memory descriptor. +\begin_inset Foot +collapsed true + +\layout Standard + +Tying the memory descriptor to a user-defined value can be useful when multiple + memory descriptors share the same event queue or when the memory descriptor + needs to be associated with a data structure maintained by the application. + For example, an MPI implementation can set the +\family typewriter +user_ptr +\family default + argument to the value of an MPI Request. + This direct association allows for processing of memory descriptors by + the MPI implementation without a table lookup or a search for the appropriate + MPI Request. +\end_inset + + +\layout Description + +eventq A handle for the event queue used to log the operations performed + on the memory region. + If this argument is +\family typewriter +PTL_EQ_NONE +\family default +, operations performed on this memory descriptor are not logged. + +\layout Subsection + +PtlMDAttach +\begin_inset LatexCommand \label{sec:mdattach} + +\end_inset + + +\layout LyX-Code + +int PtlMDAttach( ptl_handle_me_t match, +\newline + ptl_md_t mem_desc, +\newline + ptl_unlink_t unlink_op, +\newline + ptl_unlink_t unlink_nofit, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_unlink_t +\family default + are used to control whether an item is unlinked from a list. + The value +\family typewriter +PTL_UNLINK +\family default + enables unlinking. + The value +\family typewriter +PTL_RETAIN +\family default + disables unlinking. +\layout Standard + +The +\emph on +PtlMDAttach +\emph default + operation is used to create a memory descriptor and attach it to a match + list entry. + An error code is returned if this match list entry already has an associated + memory descriptor.
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INUSE Indicates that +\family typewriter +match +\family default + already has a memory descriptor attached. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +match +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface associated with +\family typewriter +match +\family default +. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the match entry that the memory descriptor will be associated + with. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_op +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when it becomes + inactive, either because the operation threshold drops to zero or because + the maximum offset has been exceeded. + (Note, the check for unlinking a memory descriptor only occurs after + the completion of a successful operation. + If the threshold is set to zero during initialization or using +\emph on +PtlMDUpdate +\emph default +, the memory descriptor is +\series bold +not +\series default + unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_nofit +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when the space + remaining in the memory descriptor is not sufficient for a matching operation. + If an incoming message arrives at a memory descriptor that does + not have sufficient space and the +\series bold +PTL_MD_TRUNCATE +\series default + option is not specified, the memory descriptor will be unlinked. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument can be NULL, in which case the handle will not be returned.
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDBind +\begin_inset LatexCommand \label{sec:mdbind} + +\end_inset + + +\layout LyX-Code + +int PtlMDBind( ptl_handle_ni_t interface, +\newline + ptl_md_t mem_desc, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlMDBind +\emph default + operation is used to create a +\begin_inset Quotes eld +\end_inset + +free floating +\begin_inset Quotes erd +\end_inset + + memory descriptor, i.e., a memory descriptor that is not associated with + a match list entry. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface, +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INV_EQ Indicates that the event queue associated with +\family typewriter +mem_desc +\family default + is not valid. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the network interface with which the memory descriptor will + be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument must be a valid address and cannot be NULL. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUnlink +\begin_inset LatexCommand \label{sec:mdfree} + +\end_inset + + +\layout LyX-Code + +int PtlMDUnlink( ptl_handle_md_t mem_desc ); +\layout Standard +\noindent +The +\emph on +PtlMDUnlink +\emph default + function unlinks the memory descriptor from any match list entry it may + be linked to and releases the resources associated with a memory descriptor. + (This function does not free the memory region associated with the memory + descriptor.) This function also releases the resources associated with a + floating memory descriptor. + Only memory descriptors with no pending operations may be unlinked. 
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. +\layout Description + +PTL_MD_INUSE Indicates that +\family typewriter +mem_desc +\family default + has pending operations and cannot be unlinked. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUpdate +\begin_inset LatexCommand \label{sec:mdupdate} + +\end_inset + + +\layout LyX-Code + +int PtlMDUpdate( ptl_handle_md_t mem_desc, +\newline + ptl_md_t* old_md, +\newline + ptl_md_t* new_md, +\newline + ptl_handle_eq_t testq ); +\layout Standard +\noindent +The +\emph on +PtlMDUpdate +\emph default + function provides a conditional, atomic update operation for memory descriptors. + The memory descriptor identified by +\family typewriter +mem_desc +\family default + is only updated if the event queue identified by +\family typewriter +testq +\family default + is empty. + The intent is to only enable updates to the memory descriptor when no new + messages have arrived since the last time the queue was checked. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + + for an example of how this function can be used. 
+\layout Standard + +If +\family typewriter +new_md +\family default + is not NULL, the memory descriptor identified by mem_desc will be updated + to reflect the values in the structure pointed to by +\family typewriter +new_md +\family default + if +\family typewriter +testq +\family default + has the value +\family typewriter +PTL_EQ_NONE +\family default + or if the event queue identified by +\family typewriter +testq +\family default + is empty. + If +\family typewriter +old_md +\family default + is not NULL, the current value of the memory descriptor identified by +\family typewriter +mem_desc +\family default + is recorded in the location identified by +\family typewriter +old_md +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_NOUPDATE Indicates that the update was not performed because +\family typewriter +testq +\family default + was not empty. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. + +\layout Description + +PTL_ILL_MD Indicates that the value pointed to by +\family typewriter +new_md +\family default + is not a legal memory descriptor (e.g., the memory region specified by the + memory descriptor may be invalid). + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +testq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +new_md +\family default + or +\family typewriter +old_md +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to update. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +old_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +old_md +\family default + is not the value +\family typewriter +NULL +\family default +, the current value of the memory descriptor will be stored in the location + identified by +\family typewriter +old_md +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +new_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +new_md +\family default + is not the value +\family typewriter +NULL +\family default +, this argument provides the new values for the memory descriptor, if the + update is performed. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +testq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for an event queue used to predicate the update. + If +\family typewriter +testq +\family default + is equal to +\family typewriter +PTL_EQ_NONE +\family default +, the update is performed unconditionally. + Otherwise, the update is performed if and only if +\family typewriter +testq +\family default + is empty. + If the update is not performed, the function returns the value +\family typewriter +PTL_NOUPDATE +\family default +.
+ (Note, the +\family typewriter +testq +\family default + argument does not need to be the same as the event queue associated with + the memory descriptor.) +\end_inset + + + + +\end_inset + + +\layout Standard + +The conditional update can be used to ensure that the memory descriptor + has not changed between the time it was examined and the time it is updated. + In particular, it is needed to support an MPI implementation where the + activity of searching an unexpected message queue and posting a receive + must be atomic. +\layout Section + +Events and Event Queues +\begin_inset LatexCommand \label{sec:eq} + +\end_inset + + +\layout Standard + +Event queues are used to log operations performed on memory descriptors. + They can also be used to hold acknowledgements for completed +\emph on +put +\emph default + operations and to note when the data specified in a +\emph on +put +\emph default + operation has been sent (i.e., when it is safe to reuse the buffer that holds + this data). + Multiple memory descriptors can share a single event queue. +\layout Standard + +In addition to the +\family typewriter +ptl_handle_eq_t +\family default + type, the Portals API defines two types associated with events: The +\family typewriter + +\newline +ptl_event_kind_t +\family default + type defines the kinds of events that can be stored in an event queue. + The +\family typewriter +ptl_event_t +\family default + type defines a structure that holds the information associated with an + event. 
+\layout Standard + +The Portals API also provides four functions for dealing with event queues: + The +\emph on +PtlEQAlloc +\emph default + function is used to allocate the API resources needed for an event queue, + the +\emph on +PtlEQFree +\emph default + function is used to release these resources, the +\emph on +PtlEQGet +\emph default + function can be used to get the next event from an event queue, and the + +\emph on +PtlEQWait +\emph default + function can be used to block a process (or thread) until an event queue + has at least one event. +\layout Subsection + +Kinds of Events +\begin_inset LatexCommand \label{sec:ek-type} + +\end_inset + + +\layout LyX-Code + +typedef enum { +\newline + PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL, +\newline + PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL, +\newline + PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL, +\newline + PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL, +\newline + PTL_EVENT_ACK, +\newline + PTL_EVENT_UNLINK +\newline +} ptl_event_kind_t; +\layout Standard +\noindent +The Portals API defines fourteen types of events that can be logged in an + event queue: +\layout Description + +PTL_EVENT_GET_START A remote +\emph on +get +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_GET_END A previously initiated +\emph on +get +\emph default + operation completed successfully. + This event is logged after the reply has been sent by the local node. + As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_GET_FAIL A previously initiated +\emph on +get +\emph default + operation completed unsuccessfully. + This event is logged after the reply has been sent by the local node. 
+ As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_PUT_START A remote +\emph on +put +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should be considered + volatile until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_PUT_END A previously initiated +\emph on +put +\emph default + operation completed successfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_PUT_FAIL A previously initiated +\emph on +put +\emph default + operation completed unsuccessfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_REPLY_START A +\emph on +reply +\emph default + operation has been started on the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_END A previously initiated +\emph on +reply +\emph default + operation has completed successfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_FAIL A previously initiated +\emph on +reply +\emph default + operation has completed unsuccessfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_ACK An +\emph on +acknowledgement +\emph default + was received. + This event is logged when the acknowledgement is received. +\layout Description + +PTL_EVENT_SEND_START An outgoing +\emph on +send +\emph default + operation has been started. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged.
+\layout Description + +PTL_EVENT_SEND_END A previously initiated +\emph on +send +\emph default + operation has completed successfully. + This event is logged after the entire buffer has been sent and it is safe + for the application to reuse the buffer. + +\layout Description + +PTL_EVENT_SEND_FAIL A previously initiated +\emph on +send +\emph default + operation has completed unsuccessfully. + The process can safely manipulate the memory or free the memory descriptor + once it sees this event. +\layout Description + +PTL_EVENT_UNLINK A memory descriptor associated with this event queue has + been automatically unlinked. + This event is not generated when a memory descriptor is explicitly unlinked + by calling +\shape italic +PtlMDUnlink +\shape default +. + This event does not decrement the threshold count. +\layout Subsection + +Event Ordering +\layout Standard + +The Portals API guarantees that when a process initiates two operations + on a remote process, the operations will be initiated on the remote process + in the same order that they were initiated on the original process. + As an example, if process A initiates two +\emph on +put +\emph default + operations, +\emph on +x +\emph default + and +\emph on +y +\emph default +, on process B, the Portals API guarantees that process A will receive the + +\family typewriter +PTL_EVENT_SEND_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default + in the same order that process B receives the +\family typewriter +PTL_EVENT_PUT_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default +. + Notice that the API does not guarantee that the start events will be delivered + in the same order that process A initiated the +\emph on +x +\emph default + and +\emph on +y +\emph default + operations.
+ If process A needs to ensure the ordering of these operations, it should + include code to wait for the initiation of +\emph on +x +\emph default + before it initiates +\emph on +y +\emph default +. +\layout Subsection + +Failure Notification +\layout Standard + +Operations may fail to complete successfully; however, unless the node itself + fails, every operation that is started will eventually complete. + While an operation is in progress, the memory associated with the operation + should not be viewed (in the case of a put or a reply) or altered (in the + case of a send or get). + Operation completion, whether successful or unsuccessful, is final. + That is, when an operation completes, the memory associated with the operation + will no longer be read or altered by the operation. + A network interface can use the +\family typewriter +ptl_ni_fail_t +\family default + to define more specific information regarding the failure of the operation + and record this information in the +\family typewriter +ni_fail_type +\family default + field of the event. +\layout Subsection + +The Event Type +\begin_inset LatexCommand \label{sec:event-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_event_kind_t type; +\newline + ptl_process_id_t initiator; +\newline + ptl_uid_t uid; +\layout LyX-Code + + ptl_pt_index_t portal; +\newline + ptl_match_bits_t match_bits; +\newline + ptl_size_t rlength; +\newline + ptl_size_t mlength; +\newline + ptl_size_t offset; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_hdr_data_t hdr_data; +\newline + ptl_seq_t link; +\newline + ptl_ni_fail_t ni_fail_type; +\newline + volatile ptl_seq_t sequence; +\newline +} ptl_event_t; +\layout Standard +\noindent +An event structure includes the following members: +\layout Description + +type Indicates the type of the event. + +\layout Description + +initiator The id of the initiator. 
+ +\layout Description + +portal The Portal table index specified in the request. + +\layout Description + +match_bits A copy of the match bits specified in the request. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for more information on match bits. + +\layout Description + +rlength The length (in bytes) specified in the request. + +\layout Description + +mlength The length (in bytes) of the data that was manipulated by the operation. + For truncated operations, the manipulated length will be the number of + bytes specified by the memory descriptor (possibly with an offset) operation. + For all other operations, the manipulated length will be the length of + the requested operation. + +\layout Description + +offset Is the displacement (in bytes) into the memory region that the operation + used. + The offset can be determined by the operation (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + +) for a remote managed memory descriptor, or by the local memory descriptor + (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +). + +\layout Description + +md_handle Is the handle to the memory descriptor associated with the event. +\layout Description + +mem_desc Is the state of the memory descriptor immediately after the event + has been processed. + +\layout Description + +hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +). + +\layout Description + +link The +\emph on +link +\emph default + member is used to link +\family typewriter +START +\family default + events with the +\family typewriter +END +\family default + or +\family typewriter +FAIL +\family default + event that signifies completion of the operation. + The +\emph on +link +\emph default + member will be the same for the two events associated with an operation. 
+ The link member is also used to link an +\family typewriter +UNLINK +\family default + event with the event that caused the memory descriptor to be unlinked. +\layout Description + +sequence The sequence number for this event. + Sequence numbers are unique to each event. +\layout Comment + +The +\emph on +sequence +\emph default + member is the last member and is volatile to support SMP implementations. + When an event structure is filled in, the +\emph on +sequence +\emph default + member should be written after all other members have been updated. + Moreover, a memory barrier should be inserted between the updating of other + members and the updating of the +\emph on +sequence +\emph default + member. +\layout Subsection + +PtlEQAlloc +\begin_inset LatexCommand \label{sec:eqalloc} + +\end_inset + + +\layout LyX-Code + +int PtlEQAlloc( ptl_handle_ni_t interface, +\newline + ptl_size_t count, +\newline + ptl_handle_eq_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlEQAlloc +\emph default + function is used to build an event queue. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + event queue. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface with which the event queue will be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +count +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The number of events that can be stored in the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQFree +\begin_inset LatexCommand \label{sec:eqfree} + +\end_inset + + +\layout LyX-Code + +int PtlEQFree( ptl_handle_eq_t eventq ); +\layout Standard +\noindent +The +\emph on +PtlEQFree +\emph default + function releases the resources associated with an event queue. + It is up to the user to ensure that no memory descriptors are associated + with the event queue once it is freed. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the event queue to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQGet +\begin_inset LatexCommand \label{sec:eqget} + +\end_inset + + +\layout LyX-Code + +int PtlEQGet( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQGet +\emph default + function is a nonblocking function that can be used to get the next event + in an event queue. + The event is removed from the queue. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_EQ_EMPTY Indicates that +\family typewriter +eventq +\family default + is empty or another thread is waiting on +\emph on +PtlEQWait +\emph default +. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQWait +\begin_inset LatexCommand \label{sec:eqwait} + +\end_inset + + +\layout LyX-Code + +int PtlEQWait( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQWait +\emph default + function can be used to block the calling process (thread) until there + is an event in an event queue. + This function also returns the next event in the event queue and removes + this event from the queue. + This is the only blocking operation in the Portals 3.2 API. + In the event that multiple threads are waiting on the same event queue, + PtlEQWait is guaranteed to wake exactly one thread, but the order in which + they are awakened is not specified. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+ +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard +\noindent + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue to wait on. + The calling process (thread) will be blocked until +\family typewriter +eventq +\family default + is not empty. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Section + +The Access Control Table +\begin_inset LatexCommand \label{sec:ac} + +\end_inset + + +\layout Standard + +Processes can use the access control table to control which processes are + allowed to perform operations on Portal table entries. + Each communication interface has a Portal table and an access control table. + The access control table for the default interface contains an entry at + index zero that allows all processes with the same user id to communicate. + Entries in the access control table can be manipulated using the +\emph on +PtlACEntry +\emph default + function.
+\layout Subsection + +PtlACEntry +\begin_inset LatexCommand \label{sec:acentry} + +\end_inset + + +\layout LyX-Code + +int PtlACEntry( ptl_handle_ni_t interface, +\newline + ptl_ac_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_uid_t user_id, +\newline + ptl_pt_index_t portal ); +\layout Standard +\noindent +The +\emph on +PtlACEntry +\emph default + function can be used to update an entry in the access control table for + an interface. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_AC_INV_INDEX Indicates that +\family typewriter +index +\family default + is not a valid access control table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_PT_INV_INDEX Indicates that +\family typewriter +portal +\family default + is not a valid Portal table index. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the interface to use. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index of the entry in the access control table to update. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the process(es) that are allowed to perform operations. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +user_id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the user that is allowed to perform operations. + The value +\family typewriter +PTL_UID_ANY +\family default + can be used to wildcard the user. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the Portal index(es) that can be used. + The value +\family typewriter +PTL_PT_INDEX_ANY +\family default + can be used to wildcard the Portal index. +\end_inset + + + + +\end_inset + + +\layout Section + +Data Movement Operations +\begin_inset LatexCommand \label{sec:datamovement} + +\end_inset + + +\layout Standard + +The Portals API provides two data movement operations: +\emph on +PtlPut +\emph default + and +\emph on +PtlGet +\emph default +. 
+\layout Subsection + +PtlPut +\begin_inset LatexCommand \label{sec:put} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t; +\newline + +\newline +int PtlPut( ptl_handle_md_t mem_desc, +\newline + ptl_ack_req_t ack_req, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset, +\newline + ptl_hdr_data_t hdr_data ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ack_req_t +\family default + are used to control whether an acknowledgement should be sent when the + operation completes (i.e., when the data has been written to a memory descriptor + of the +\family typewriter +target +\family default + process). + The value +\family typewriter +PTL_ACK_REQ +\family default + requests an acknowledgement, the value +\family typewriter +PTL_NOACK_REQ +\family default + requests that no acknowledgement should be generated. +\layout Standard + +The +\emph on +PtlPut +\emph default + function initiates an asynchronous put operation. + There are several events associated with a put operation: initiation of + the send on the local node ( +\family typewriter +PTL_EVENT_SEND_START +\family default +), completion of the send on the local node ( +\family typewriter +PTL_EVENT_SEND_END +\family default + or +\family typewriter +PTL_EVENT_SEND_FAIL +\family default +), and, when the send completes successfully, the receipt of an acknowledgement + ( +\family typewriter +PTL_EVENT_ACK +\family default +) indicating that the operation was accepted by the target. + These events will be logged in the event queue associated with the memory + descriptor ( +\family typewriter +mem_desc +\family default +) used in the put operation. + Using a memory descriptor that does not have an associated event queue + results in these events being discarded. 
+ In this case, the application must have another mechanism (e.g., a higher + level protocol) for determining when it is safe to modify the memory region + associated with the memory descriptor. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory to be sent. + If the memory descriptor has an event queue associated with it, it will + be used to record events when the message has been sent (PTL_EVENT_SEND_START, + PTL_EVENT_SEND_END). + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ack_req +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Controls whether an acknowledgement event is requested. + Acknowledgements are only sent when they are requested by the initiating + process +\series bold +and +\series default + the memory descriptor has an event queue +\series bold +and +\series default + the target memory descriptor enables them. + Allowed constants: +\family typewriter +PTL_ACK_REQ +\family default +, +\family typewriter +PTL_NOACK_REQ +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +hdr_data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +64 bits of user data that can be included in the message header.
+ This data is written to an event queue entry at the target if an event + queue is present on the matching memory descriptor. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlGet +\begin_inset LatexCommand \label{sec:get} + +\end_inset + + +\layout LyX-Code + +int PtlGet( ptl_handle_md_t mem_desc, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset ); +\layout Standard +\noindent +The +\emph on +PtlGet +\emph default + function initiates a remote read operation. + There are two event pairs associated with a get operation: when the data + is sent from the remote node, a +\family typewriter +PTL_EVENT_GET_{START|END} +\family default + event pair is registered on the remote node; and when the data is returned + from the remote node, a +\family typewriter +PTL_EVENT_REPLY_{START|END} +\family default + event pair is registered on the local node. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory into which + the requested data will be received.
+ The memory descriptor can have an event queue associated with it to record + events, such as when the message receive has started ( +\family typewriter +PTL_EVENT_REPLY +\family default +_ +\family typewriter +START +\family default +). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). 
+\end_inset + + + + +\end_inset + + +\layout Section + +Summary +\layout Standard + + +\begin_inset LatexCommand \label{sec:summary} + +\end_inset + + We conclude this section by summarizing the names introduced by the Portals + 3.2 API. + We start by summarizing the names of the types introduced by the API. + This is followed by a summary of the functions introduced by the API. + That, in turn, is followed by a summary of the function return codes. + Finally, we conclude with a summary of the other constant values introduced + by the API. +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + + presents a summary of the types defined by the Portals API. + The first column in this table gives the type name, the second column gives + a brief description of the type, the third column identifies the section + where the type is defined, and the fourth column lists the functions that + have arguments of this type. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Types Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:types} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\noindent + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Sect +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Functions +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} +
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlACEntry, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +acknowledgement request types +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlPut +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +kinds of events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +information about events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_seq_t +\end_inset + + +\begin_inset Text + +\layout Standard + +event sequence number +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_any_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for any object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + +
+\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for event queues +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert, + PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_me_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for match entries +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_ni_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut, + PtlGet +\end_inset + + + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter +ptl_nid_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +node identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +process identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +user identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetUid, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +insertion position (before or after) +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +identifiers for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + +
+\family typewriter +ptl_match_bits_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +match (and ignore) bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mb-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ni_fail_t +\end_inset + + +\begin_inset Text + +\layout Standard + +network interface-specific failures +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +process identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for Portal tables +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + +\end_inset + + +\end_inset + 
+ +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +sizes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:size-t} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_value_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +values in status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +unlink options +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + + presents a summary of the functions defined by the Portals API. 
+ The first column in this table gives the name for the function, the second + column gives a brief description of the operation implemented by the function, + and the third column identifies the section where the function is defined. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Functions Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:func} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Operation +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlACEntry +\end_inset + + +\begin_inset Text + +\layout Standard + + update an entry in an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQAlloc +\end_inset + + +\begin_inset Text + +\layout Standard + + create an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQGet +\end_inset + + +\begin_inset Text + +\layout Standard + + get the next event from an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQFree +\end_inset + + +\begin_inset Text + +\layout Standard + + release the resources for an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + PtlEQWait +\end_inset + + +\begin_inset Text + +\layout Standard + + wait for a new event in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGet +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a get operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGetId +\end_inset + + +\begin_inset Text + +\layout Standard + + get the id for the current process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDAttach +\end_inset + + +\begin_inset Text + +\layout Standard + + create a memory descriptor and attach it to a match entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDBind +\end_inset + + +\begin_inset Text + +\layout Standard + + create a free-floating memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + + PtlMDUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a memory descriptor from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDUpdate +\end_inset + + +\begin_inset Text + +\layout Standard + + update a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEAttach +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a Portal table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a free Portal table entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:attachany} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEInsert +\end_inset + + +\begin_inset Text + +\layout Standard + + create a match entry and insert it in a list +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a match entry from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIDist +\end_inset + + +\begin_inset Text + +\layout Standard + + get the distance to another process +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIHandle +\end_inset + + +\begin_inset Text + +\layout Standard + + get the network interface handle for an object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIStatus +\end_inset + + +\begin_inset Text + +\layout Standard + + read a network interface status register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlPut +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a put operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + + summarizes the return codes used by functions defined by the Portals API. + All of these constants are integer values. 
+ The first column of this table gives the symbolic name for the constant, + the second column gives a brief description of the value, and the third + column identifies the functions that can return this value. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Function Return Codes for the Portals 3.2 API +\begin_inset LatexCommand \label{tab:retcodes} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Functions +\series default + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_AC_INV_INDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_DROPPED +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +at least one event has been dropped +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_EMPTY +\end_inset + + +\begin_inset Text + +\layout Standard + +no events available in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +error during initialization or cleanup +\end_inset + + +\begin_inset Text + +\layout Standard
+\noindent + PtlInit, PtlFini +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ILL_MD +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +illegal memory descriptor values +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDBind, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_DUP +\end_inset + + +\begin_inset Text + +\layout Standard + +duplicate initialization of an interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_INV +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +initialization of an invalid interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +the ME already has an MD +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ASIZE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table size +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_EQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUpdate, PtlEQFree, PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_HANDLE +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_MD +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid memory descriptor handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUnlink, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ME +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid match entry handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_NI +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid network interface handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PROC +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid process identifier +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PTINDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid Portal table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_REG +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_SR_INDX +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ML_TOOLONG +\end_inset + + +\begin_inset Text + +\layout Standard + +match list too long +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +MD has pending operations +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMDUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOINIT +\end_inset + + +\begin_inset Text + +\layout Standard + +uninitialized API +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\emph default +, except PtlInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOSPACE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +insufficient memory +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOUPDATE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + no update was performed +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_FULL +\end_inset + + +\begin_inset Text + +\layout Standard + +Portal table is full +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_OK +\end_inset + + +\begin_inset Text + +\layout Standard + + success +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_SEGV +\end_inset + + +\begin_inset Text + +\layout Standard + +addressing violation +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate, + PtlEQAlloc, PtlEQGet, PtlEQWait +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + summarizes the remaining constant values introduced by the Portals API. + The first column in this table presents the symbolic name for the constant, + the second column gives a brief description of the value, the third column + identifies the type for the value, the fourth column identifies the section + in which the value is introduced, and the fifth column identifies the sections + in which the value is referenced. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Other Constants Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:oconsts} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Base type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Intr. +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Ref. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ACK_REQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +request an acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_NONE +\end_inset + + +\begin_inset Text + +\layout Standard + +a NULL event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_START +\end_inset + + +\begin_inset Text + +\layout Standard + +get event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_END +\end_inset + + +\begin_inset Text + +\layout Standard + +get event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + 
+\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +get event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_START +\end_inset + + +\begin_inset Text + +\layout Standard + +put event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_END +\end_inset + + +\begin_inset Text + +\layout Standard + +put event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +put event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + 
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_START +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_END +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_START +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_END +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_START +\end_inset + + +\begin_inset Text + +\layout Standard + +send event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_END +\end_inset + + +\begin_inset Text + +\layout Standard + +send event end +\end_inset + + +\begin_inset Text 
+ +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +send event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +unlink event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for process id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_NID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for node id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_nid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for user id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_IFACE_DEFAULT +\end_inset + + +\begin_inset Text + +\layout Standard + +default interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_AFTER +\end_inset + + +\begin_inset Text + +\layout Standard + +insert after +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset 
+ + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_BEFORE +\end_inset + + +\begin_inset Text + +\layout Standard + +insert before +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_ACK_DISABLE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to disable acknowledgements +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_MANAGE_REMOTE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable the use of remote offsets +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_GET +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable get operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_PUT +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable put operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_THRESH_INF +\end_inset + + +\begin_inset Text + +\layout Standard + +infinite threshold for a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_TRUNCATE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable truncation of a request +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOACK_REQ +\end_inset + + +\begin_inset Text + +\layout Standard + +request no acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_INDEX_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard 
for Portal indexes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_RETAIN +\end_inset + + +\begin_inset Text + +\layout Standard + +disable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_SR_DROP_COUNT +\end_inset + + +\begin_inset Text + +\layout Standard + +index for the dropped count register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +enable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Chapter + +The Semantics of Message Transmission +\begin_inset LatexCommand \label{sec:semantics} + +\end_inset + + +\layout Standard + +The portals API uses four types of messages: put requests, 
acknowledgements, + get requests, and replies. + In this section, we describe the information passed on the wire for each + type of message. + We also describe how this information is used to process incoming messages. +\layout Section + +Sending Messages +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:put-wire} + +\end_inset + + summarizes the information that is transmitted for a put request. + The first column provides a descriptive name for the information, the second + column provides the type for this information, the third column identifies + the source of the information, and the fourth column provides additional + notes. + Most information that is transmitted is obtained directly from the +\emph on +PtlPut +\emph default + operation. + Notice that the handle for the memory descriptor used in the +\emph on +PtlPut +\emph default + operation is transmitted even though this value cannot be interpreted by + the target. + Any value other than +\family typewriter +PTL_MD_NONE +\family default + is interpreted as a request for an acknowledgement. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Put Request +\begin_inset LatexCommand \label{tab:put-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlPut +\emph default + arg +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a put request +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +no ack if +\family typewriter +PTL_MD_NONE +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length 
+\family default + member +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family roman +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +start +\family default + and +\family typewriter +length +\family default + members +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:ack-wire} + +\end_inset + + summarizes the information transmitted in an acknowledgement. + Most of the information is simply echoed from the put request. + Notice that the initiator and target are obtained directly from the put + request, but are swapped in generating the acknowledgement. + The only new piece of information in the acknowledgement is the manipulated + length which is determined as the put request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in an Acknowledgement +\begin_inset LatexCommand \label{tab:ack-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Put Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + 
+\begin_inset Text + +\layout Standard + + indicates an acknowledgement +\end_inset + + + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + requested length +\end_inset + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + length +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + + obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:get-wire} + +\end_inset + + summarizes the information that is transmitted for a get request. + Like the information transmitted in a put request, most of the information + transmitted in a get request is obtained directly from the +\emph on +PtlGet +\emph default + operation. + Unlike put requests, get requests do not include the event queue handle. + In this case, the reply is generated whenever the operation succeeds and + the memory descriptor must not be unlinked until the reply is received. + As such, there is no advantage to explicitly sending the event queue handle. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Get Request +\begin_inset LatexCommand \label{tab:get-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlGet +\emph default + argument +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a get operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length +\family default + member +\end_inset + + + + +\end_inset + 
+ +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:reply-wire} + +\end_inset + + summarizes the information transmitted in a reply. + Like an acknowledgement, most of the information is simply echoed from + the get request. + The initiator and target are obtained directly from the get request, but + are swapped in generating the reply. + The only new information in the reply are the manipulated length + and the data, which are determined as the get request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Reply +\begin_inset LatexCommand \label{tab:reply-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Get Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a reply +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +requested length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout 
Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Section + +Receiving Messages +\begin_inset LatexCommand \label{sec:receiving} + +\end_inset + + +\layout Standard + +When an incoming message arrives on a network interface, the communication + system first checks that the target process identified in the request is + a valid process that has initialized the network interface (i.e., that the + target process has a valid Portal table). + If this test fails, the communication system discards the message and increment +s the dropped message count for the interface. + The remainder of the processing depends on the type of the incoming message. + Put and get messages are subject to access control checks and translation + (searching a match list), while acknowledgement and reply messages bypass + the access control checks and the translation step. +\layout Standard + +Acknowledgement messages include a handle for the memory descriptor used + in the original +\emph on +PtlPut +\emph default + operation. + This memory descriptor will identify the event queue where the event should + be recorded. + Upon receipt of an acknowledgement, the runtime system only needs to confirm + that the memory descriptor and event queue still exist and that there is + space for another event. + Should any of these conditions fail, the message is simply discarded + and the dropped message count for the interface is incremented. + Otherwise, the system builds an acknowledgement event from the information + in the acknowledgement message and adds it to the event queue. 
+\layout Standard + +Reception of reply messages is also relatively straightforward. + Each reply message includes a handle for a memory descriptor. + If this descriptor exists, it is used to receive the message. + A reply message will be dropped if the memory descriptor identified in + the request doesn't exist. + In this case, the dropped message count for the interface is + incremented. + These are the only reasons for dropping reply messages. + Every memory descriptor accepts and truncates incoming reply messages, + eliminating the other potential reasons for rejecting a reply message. +\layout Standard + +The critical step in processing an incoming put or get request involves + mapping the request to a memory descriptor. + This step starts by using the Portal index in the incoming request to identify + a list of match entries. + This list of match entries is searched in order until a match entry is + found whose match criteria match the match bits in the incoming request + and whose memory descriptor accepts the request. +\layout Standard + +Because acknowledgement and reply messages are generated in response to requests + made by the process receiving these messages, the checks performed by the + runtime system for acknowledgements and replies are minimal. + In contrast, put and get messages are generated by remote processes and + the checks performed for these messages are more extensive. 
+ Incoming put or get messages may be rejected because: +\layout Itemize + +the Portal index supplied in the request is not valid; +\layout Itemize + +the cookie supplied in the request is not a valid access control entry; + +\layout Itemize + +the access control entry identified by the cookie does not match the identifier + of the requesting process; +\layout Itemize + +the access control entry identified by the cookie does not + match the Portal index supplied in the request; or +\layout Itemize + +the match bits supplied in the request do not match any of the match entries + with a memory descriptor that accepts the request. + +\layout Standard + +In all cases, if the message is rejected, the incoming message is discarded + and the dropped message count for the interface is incremented. +\layout Standard + +A memory descriptor may reject an incoming request for any of the following + reasons: +\layout Itemize + +the +\family typewriter +PTL_MD_PUT +\family default + or +\family typewriter +PTL_MD_GET +\family default + option has not been enabled and the operation is put or get, respectively; + +\layout Itemize + +the length specified in the request is too long for the memory descriptor + and the +\family typewriter +PTL_MD_TRUNCATE +\family default + option has not been enabled. +\layout Chapter + +Examples +\begin_inset LatexCommand \label{sec:examples} + +\end_inset + + +\layout Comment + +The examples presented in this chapter have not been updated to reflect + the current API. +\layout Standard + +In this section we present several examples to illustrate expected usage + patterns for the Portals 3.2 API. + The first example describes how to implement parallel servers using the + features of the Portals 3.2 API. + This example covers the access control list and the use of remote managed + offsets. + The second example presents an approach to dealing with dropped requests. + This example covers aspects of match lists and memory descriptors. 
+ The final example covers message reception in MPI. + This example illustrates more sophisticated uses of matching and a procedure + to update a memory descriptor. +\layout Section + +Parallel File Servers +\begin_inset LatexCommand \label{sec:expfs} + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:file} + +\end_inset + + illustrates the logical structure of a parallel file server. + In this case, the parallel server consists of four servers that stripe + application data across four disks. + We would like to present applications with the illusion that the file server + is a single entity. + We will assume that all of the processes that constitute the parallel server + have the same user id. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename file.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 196pt + lyxheight 147pt +\end_inset + + +\layout Caption + +Parallel File Server +\begin_inset LatexCommand \label{fig:file} + +\end_inset + + +\end_inset + + +\layout Standard + +When an application establishes a connection to the parallel file server, + it will allocate a Portal and access control list entry for communicating + with the server. + The access control list entry will include the Portal and match any process + in the parallel file server, so all of the file server processes will + have access to the portal. + The Portal information and access control entry will be sent to the file + server at this time. + If the application and server need to have multiple, concurrent I/O operations, + they can use additional portals or match entries to keep the operations + from interfering with one another. 
+\layout Standard + +When an application initiates an I/O operation, it first builds a memory + descriptor that describes the memory region involved in the operation. + This memory descriptor will enable the appropriate operation (put for read + operations and get for write operations) and enable the use of remote offsets + (this lets the servers decide where their data should be placed in the + memory region). + After creating the memory descriptor and linking it into the appropriate + Portal entry, the application sends a read or write request (using +\emph on +PtlPut +\emph default +) to one of the file server processes. + The file server processes can then use put or get operations with the appropria +te offsets to fill or retrieve the contents of the application's buffer. + To know when the operation has completed, the application can add an event + queue to the memory descriptor and add up the lengths of the remote operations + until the sum is the size of the requested I/O operation. +\layout Section + +Dealing with Dropped Requests +\begin_inset LatexCommand \label{sec:exdrop} + +\end_inset + + +\layout Standard + +If a process does not anticipate unexpected requests, they will be discarded. + Applications using the Portals API can query the dropped count for the + interface to determine the number of requests that have been dropped (see + Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + While this approach minimizes resource consumption, it does not provide + information that might be critical in debugging the implementation of a + higher level protocol. +\layout Standard + +To keep track of more information about dropped requests, we use a memory + descriptor that truncates each incoming request to zero bytes and logs + the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + operations in an event queue. 
+ Note that the operations are not dropped in the Portals sense, because + the operation succeeds. +\layout Standard + +The following code fragment illustrates an implementation of this approach. + In this case, we assume that a thread is launched to execute the function + +\family typewriter +watch_drop +\family default +. + This code starts by building an event queue to log truncated operations + and a memory descriptor to truncate the incoming requests. + This example only captures +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests for a single portal. + In a more realistic situation, the memory descriptor would be appended + to the match list for every portal. + We also assume that the thread is capable of keeping up with the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests. + If this is not the case, we could use a finite threshold on the memory + descriptor to capture the first few dropped requests. 
+\layout LyX-Code + + +\size small +#include +\newline +#include +\newline +#include +\newline + +\newline +#define DROP_SIZE 32 /* number of dropped requests to track */ +\newline + +\newline +int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) { +\newline + ptl_handle_eq_t drop_events; +\newline + ptl_event_t event; +\newline + ptl_handle_md_t drop_em; +\newline + ptl_md_t drop_desc; +\newline + ptl_process_id_t any_proc; +\newline + ptl_handle_me_t match_any; +\newline + +\newline + /* create the event queue */ +\newline + if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the event queue +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* build a match entry */ +\newline + any_proc.nid = PTL_ID_ANY; +\newline + any_proc.pid = PTL_ID_ANY; +\newline + PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN, +\newline + &match_any ); +\newline + +\newline + /* create the memory descriptor */ +\newline + drop_desc.start = NULL; +\newline + drop_desc.length = 0; +\newline + drop_desc.threshold = PTL_MD_THRESH_INF; +\newline + drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE; +\newline + drop_desc.user_ptr = NULL; +\newline + drop_desc.eventq = drop_events; +\newline + if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the memory descriptor +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* watch for "dropped" requests */ +\newline + while( 1 ) { +\newline + if( PtlEQWait( drop_events, &event ) != PTL_OK ) break; +\newline + fprintf( stderr, "Dropped request from gid = event.initiator.gid, + event.initiator.rid ); +\newline + } +\newline +} +\layout Section + +Message Transmission in MPI +\begin_inset LatexCommand \label{sec:exmpi} + +\end_inset + + +\layout Standard + +We conclude this section with a fairly extensive example that describes + an approach to 
implementing message transmission for MPI. + Like many MPI implementations, we distinguish two message transmission + protocols: a short message protocol and a long message protocol. + We use the constant +\family typewriter +MPI_LONG_LENGTH +\family default + to determine the size of a long message. +\layout Standard + +For small messages, the sender simply sends the message and presumes that + the message will be received (i.e., the receiver has allocated a memory region + to receive the message body). + For large messages, the sender also sends the message, but does not presume + that the message body will be saved. + Instead, the sender builds a memory descriptor for the message and enables + get operations on this descriptor. + If the target does not save the body of the message, it will record an + event for the put operation. + When the process later issues a matching MPI receive, it will perform a + get operation to retrieve the body of the message. +\layout Standard + +To facilitate receive side matching based on the protocol, we use the most + significant bit in the match bits to indicate the protocol: 1 for long + messages and 0 for short messages. +\layout Standard + +The following code presents a function that implements the send side of + the protocol. + The global variable +\family typewriter +EndGet +\family default + is the last match entry attached to the Portal index used for posting long + messages. + This entry does not match any incoming requests (i.e., the memory descriptor + rejects all get operations) and is built during initialization of the MPI + library. + The other global variable, +\family typewriter +MPI_NI +\family default +, is a handle for the network interface used by the MPI implementation. 
+\layout LyX-Code + + +\size small +extern ptl_handle_me_t EndGet; +\newline +extern ptl_handle_ni_t MPI_NI; +\newline + +\newline +void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq, +\newline + ptl_process_id_t target, ptl_match_bits_t match ) +\newline +{ +\newline + ptl_handle_md_t send_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_ack_req_t want_ack; +\newline + +\newline + mem_desc.start = buf; +\newline + mem_desc.length = len; +\newline + mem_desc.threshold = 1; +\newline + mem_desc.options = PTL_MD_GET_OP; +\newline + mem_desc.user_ptr = data; +\newline + mem_desc.eventq = eventq; +\newline + +\newline + if( len >= MPI_LONG_LENGTH ) { +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + /* add a match entry to the end of the get list */ +\newline + PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet, + &me_handle ); +\newline + PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL ); +\newline + +\newline + /* we want an ack for long messages */ +\newline + want_ack = PTL_ACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a long message + */ +\newline + match |= (ptl_match_bits_t)1<<63; +\newline + } else { +\newline + /* we don't want an ack for short messages */ +\newline + want_ack = PTL_NOACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a short message + */ +\newline + match &= ~((ptl_match_bits_t)1<<63); +\newline + } +\newline + +\newline + /* create a memory descriptor and send it */ +\newline + PtlMDBind( MPI_NI, mem_desc, &send_handle ); +\newline + PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match, + 0 ); +\newline +} +\layout Standard + +The +\emph on +MPIsend +\emph default + function returns as soon as the message has been scheduled for transmission. + The event queue argument, +\family typewriter +eventq +\family default +, can be used to determine the disposition of the message. 
+ Assuming that +\family typewriter +eventq +\family default + is not +\family typewriter +PTL_EQ_NONE +\family default +, a +\family typewriter +PTL_EVENT_SENT +\family default + event will be recorded for each message as the message is transmitted. + For small messages, this is the only event that will be recorded in +\family typewriter +eventq +\family default +. + In contrast, long messages include an explicit request for an acknowledgement. + If the +\family typewriter +target +\family default + process has posted a matching receive, the acknowledgement will be sent + as the message is received. + If a matching receive has not been posted, the message will be discarded + and no acknowledgement will be sent. + When the +\family typewriter +target +\family default + process later issues a matching receive, the receive will be translated + into a get operation and a +\family typewriter +PTL_EVENT_GET +\family default + event will be recorded in +\family typewriter +eventq +\family default +. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:mpi} + +\end_inset + + illustrates the organization of the match list used for receiving MPI messages. + The initial entries (not shown in this figure) would be used to match the + MPI receives that have been preposted by the application. + The preposted receives are followed by a match entry, +\emph on +RcvMark +\emph default +, that marks the boundary between preposted receives and the memory descriptors + used for +\begin_inset Quotes eld +\end_inset + +unexpected +\begin_inset Quotes erd +\end_inset + + messages. + The +\emph on +RcvMark +\emph default + entry is followed by a small collection of match entries that match unexpected + +\begin_inset Quotes eld +\end_inset + +short +\begin_inset Quotes erd +\end_inset + + messages, i.e., messages that have a 0 in the most significant bit of their + match bits. 
+ The memory descriptors associated with these match entries will append + the incoming message to the associated memory descriptor and record an + event in an event queue for unexpected messages. + The unexpected short message matching entries are followed by a match entry + that will match messages that were not matched by the preceding match entries, + i.e., the unexpected long messages. + The memory descriptor associated with this match entry truncates the message + body and records an event in the event queue for unexpected messages. + Note that all of the memory descriptors used for unexpected messages share + a common event queue. + This makes it possible to process the unexpected messages in the order + in which they arrived, regardless of whether they are short or long messages. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename mpi.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 389pt + lyxheight 284pt +\end_inset + + +\layout Caption + +Message Reception in MPI +\begin_inset LatexCommand \label{fig:mpi} + +\end_inset + + +\end_inset + + +\layout Standard + +When the local MPI process posts an MPI receive, we must first search the + events in the unexpected message queue to see if a matching message has already + arrived. + If no matching message is found, a match entry for the receive is inserted + before the +\emph on +RcvMark +\emph default + entry--after the match entries for all of the previously posted receives + and before the match entries for the unexpected messages. + This ensures that preposted receives are matched in the order that they + were posted (a requirement of MPI). 
+ +\layout Standard + +While this strategy respects the temporal semantics of MPI, it introduces + a race condition: a matching message might arrive after the events in the + unexpected message queue have been searched, but before the match entry + for the receive has been inserted in the match list. + +\layout Standard + +To avoid this race condition we start by setting the +\family typewriter +threshold +\family default + of the memory descriptor to 0, making the descriptor inactive. + We then insert the match entry into the match list and proceed to search + the events in the unexpected message queue. + A matching message that arrives as we are searching the unexpected message + queue will not be accepted by the memory descriptor and, if not matched + by an earlier match list element, will add an event to the unexpected message + queue. + After searching the events in the unexpected message queue, we update the + memory descriptor, setting the threshold to 1 to activate the memory descriptor. + This update is predicated on the condition that the unexpected message + queue is empty. + We repeat the process of searching the unexpected message queue until the + update succeeds. +\layout Standard + +The following code fragment illustrates this approach. + Because events must be removed from the unexpected message queue to be + examined, this code fragment assumes the existence of a user managed event + list, +\family typewriter +Rcvd +\family default +, for the events that have already been removed from the unexpected message + queue. + In an effort to keep the example focused on the basic protocol, we have + omitted the code that would be needed to manage the memory descriptors + used for unexpected short messages. + In particular, we simply leave messages in these descriptors until they + are received by the application. 
+ In a robust implementation, we would introduce code to ensure that short + unexpected messages are removed from these memory descriptors so that they + can be re-used. +\layout LyX-Code + + +\size small +extern ptl_handle_eq_t UnexpQueue; +\newline +extern ptl_handle_me_t RcvMark; +\newline +extern ptl_handle_me_t ShortMatch; +\newline + +\newline +typedef struct event_list_tag { +\newline + ptl_event_t event; +\newline + struct event_list_tag* next; +\newline +} event_list; +\newline + +\newline +extern event_list Rcvd; +\newline + +\newline +void AppendRcvd( ptl_event_t event ) +\newline +{ +\newline + /* append an event onto the Rcvd list */ +\newline +} +\newline + +\newline +int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi +ts_t match, +\newline + ptl_match_bits_t ignore, ptl_event_t *event ) +\newline +{ +\newline + /* Search the Rcvd event queue, looking for a message that matches the + requested message. +\newline + * If one is found, remove the event from the Rcvd list and return it. 
+ */ +\newline +} +\newline + +\newline +typedef enum { RECEIVED, POSTED } receive_state; +\newline + +\newline +receive_state CopyMsg( void *buf, ptl_size_t &length, ptl_event_t event, + ptl_md_t md_buf ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + if( event.rlength >= MPI_LONG_LENGTH ) { +\newline + PtlMDBind( MPI_NI, md_buf, &md_handle ); +\newline + PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX, + md_handle ); +\newline + return POSTED; +\newline + } else { +\newline + /* copy the message */ +\newline + if( event.mlength < *length ) *length = event.mlength; +\newline + memcpy( buf, (char*)event.md_desc.start+event.offset, *length ); +\newline + return RECEIVED; +\newline + } +\newline +} +\newline + +\newline +receive_state MPIreceive( void *buf, ptl_size_t &len, void *MPI_data, ptl_handle +_eq_t eventq, +\newline + ptl_process_id_t sender, ptl_match_bits_t match, + ptl_match_bits_t ignore ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_handle_me_t me_handle; +\newline + ptl_event_t event; +\newline + +\newline + /* build a memory descriptor for the receive */ +\newline + md_buf.start = buf; +\newline + md_buf.length = *len; +\newline + md_buf.threshold = 0; /* temporarily disabled */ +\newline + md_buf.options = PTL_MD_PUT_OP; +\newline + md_buf.user_ptr = MPI_data; +\newline + md_buf.eventq = eventq; +\newline + +\newline + /* see if we have already received the message */ +\newline + if( SearchRcvd(buf, len, sender, match, ignore, &event) ) +\newline + return CopyMsg( buf, len, event, md_buf ); +\newline + +\newline + /* create the match entry and attach the memory descriptor */ +\newline + PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark, + &me_handle); +\newline + PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle ); +\newline + +\newline + md_buf.threshold = 1; +\newline + do +\newline + if( 
PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) { +\newline + if( MPIMatch(event, match, ignore, sender) ) { +\newline + return CopyMsg( buf, len, (char*)event.md_desc.start+event.offset, + md_buf ); +\newline + } else { +\newline + AppendRcvd( event ); +\newline + } +\newline + } +\newline + while( PtlMDUpdate(md_handle, NULL, &md_buf, unexp_queue) == PTL_NOUPDATE + ); +\newline + return POSTED; +\newline +} +\layout Chapter* + +Acknowledgments +\layout Standard + +Several people have contributed to the philosophy, design, and implementation + of the Portals message passing architecture as it has evolved. + We acknowledge the following people for their contributions: Al Audette, + Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike + Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke, + Dave van Dresser, Lee Ward, and Stephen Wheat. + +\layout Standard + + +\begin_inset LatexCommand \BibTeX[ieee]{portals3} + +\end_inset + + +\the_end diff --git a/lustre/portals/doc/put.fig b/lustre/portals/doc/put.fig new file mode 100644 index 0000000..5235b6d --- /dev/null +++ b/lustre/portals/doc/put.fig @@ -0,0 +1,32 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1350 900 2175 1200 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1275 2700 1725 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 1200 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2699 1788 899 1938 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001 +4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001 +4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001 
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am new file mode 100644 index 0000000..2cf7f99 --- /dev/null +++ b/lustre/portals/include/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = portals linux +EXTRA_DIST = config.h.in +include $(top_srcdir)/Rules diff --git a/lustre/portals/include/config.h.in b/lustre/portals/include/config.h.in new file mode 100644 index 0000000..b05d0c4 --- /dev/null +++ b/lustre/portals/include/config.h.in @@ -0,0 +1,11 @@ +/* ../include/config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if you have the readline library (-lreadline). */ +#undef HAVE_LIBREADLINE + +/* Name of package */ +#undef PACKAGE + +/* Version number of package */ +#undef VERSION + diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am new file mode 100644 index 0000000..6a65cb5 --- /dev/null +++ b/lustre/portals/include/linux/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include $(top_srcdir)/Rules + +linuxincludedir = $(includedir)/linux + +linuxinclude_HEADERS=kp30.h portals_lib.h diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h new file mode 100644 index 0000000..4915fe3 --- /dev/null +++ b/lustre/portals/include/linux/kp30.h @@ -0,0 +1,936 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _KP30_INCLUDED +#define _KP30_INCLUDED + + +#define PORTAL_DEBUG + +#ifndef offsetof +# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) +#endif + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +#ifndef CONFIG_SMP +# define smp_processor_id() 0 +#endif + +/* + * Debugging + */ +extern unsigned int portal_subsystem_debug; +extern unsigned int portal_stack; +extern unsigned int portal_debug; +extern unsigned int portal_printk; +/* Debugging subsystems (8 bit ID) + * + * If you add debug subsystem #32, you need to send email to phil, because + * you're going to break kernel subsystem debug filtering. */ +#define S_UNDEFINED (0 << 24) +#define S_MDC (1 << 24) +#define S_MDS (2 << 24) +#define S_OSC (3 << 24) +#define S_OST (4 << 24) +#define S_CLASS (5 << 24) +#define S_OBDFS (6 << 24) /* obsolete */ +#define S_LLITE (7 << 24) +#define S_RPC (8 << 24) +#define S_EXT2OBD (9 << 24) /* obsolete */ +#define S_PORTALS (10 << 24) +#define S_SOCKNAL (11 << 24) +#define S_QSWNAL (12 << 24) +#define S_PINGER (13 << 24) +#define S_FILTER (14 << 24) +#define S_TRACE (15 << 24) /* obsolete */ +#define S_ECHO (16 << 24) +#define S_LDLM (17 << 24) +#define S_LOV (18 << 24) +#define S_GMNAL (19 << 24) +#define S_PTLROUTER (20 << 24) +#define S_COBD (21 << 24) +#define S_PTLBD (22 << 24) +#define S_LOG (23 << 24) + +/* If you change these values, please keep portals/linux/utils/debug.c + * up to date! 
*/
+
+/* Debugging masks (24 bits, non-overlapping) */
+#define D_TRACE     (1 << 0) /* ENTRY/EXIT markers */
+#define D_INODE     (1 << 1)
+#define D_SUPER     (1 << 2)
+#define D_EXT2      (1 << 3) /* anything from ext2_debug */
+#define D_MALLOC    (1 << 4) /* print malloc, free information */
+#define D_CACHE     (1 << 5) /* cache-related items */
+#define D_INFO      (1 << 6) /* general information */
+#define D_IOCTL     (1 << 7) /* ioctl related information */
+#define D_BLOCKS    (1 << 8) /* ext2 block allocation */
+#define D_NET       (1 << 9) /* network communications */
+#define D_WARNING   (1 << 10)
+#define D_BUFFS     (1 << 11)
+#define D_OTHER     (1 << 12)
+#define D_DENTRY    (1 << 13)
+#define D_PORTALS   (1 << 14) /* ENTRY/EXIT markers */
+#define D_PAGE      (1 << 15) /* bulk page handling */
+#define D_DLMTRACE  (1 << 16)
+#define D_ERROR     (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG     (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA        (1 << 19) /* recovery and failover */
+#define D_RPCTRACE  (1 << 20) /* for distributed debugging */
+#define D_VFSTRACE  (1 << 21)
+
+#ifndef THREAD_SIZE
+#define THREAD_SIZE 8192
+#endif
+/*
+ * CDEBUG_STACK(var) yields the value CDEBUG() keeps in its local 'stack' and
+ * passes on to CHECK_STACK() and portals_debug_msg().  The ia64 variant masks
+ * the address of 'var' with THREAD_SIZE - 1; all other builds subtract the
+ * masked frame address from THREAD_SIZE.  The address has to be converted to
+ * an integer before masking, because '&' is not defined for pointer operands.
+ */
+#ifdef __arch_ia64__
+#define CDEBUG_STACK(var) ((unsigned long)&(var) & (THREAD_SIZE - 1))
+#else
+#define CDEBUG_STACK(var) (THREAD_SIZE -                                      \
+                           ((unsigned long)__builtin_frame_address(0)&        \
+                            (THREAD_SIZE - 1)))
+#endif
+
+/*
+ * CHECK_STACK(stack): kernel builds emit a D_ERROR message whenever 'stack'
+ * exceeds both 3/4 of THREAD_SIZE and the maximum recorded so far in
+ * portal_stack, updating portal_stack as a side effect of building the
+ * message arguments.  Non-kernel builds compile it away.
+ */
+#ifdef __KERNEL__
+#define CHECK_STACK(stack)                                                    \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack)      \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR,           \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          (stack),                            \
+                                          "maximum lustre stack %u\n",        \
+                                          portal_stack = (stack));            \
+        } while (0)
+#else
+#define CHECK_STACK(stack) do{}while(0)
+#endif
+
+#define CDEBUG(mask, format, a...) 
\ +do { \ + unsigned long stack = CDEBUG_STACK(stack); \ + int match = 0; \ + \ + CHECK_STACK(stack); \ + if (!(mask)) \ + match = 1; \ + else if ((mask) & (D_ERROR | D_EMERG)) \ + match = 1; \ + else if (portal_debug & (mask) && \ + portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24))) \ + match = 1; \ + if (match) \ + portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + stack, format , ## a); \ +} while (0) + +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) + +#define GOTO(label, rc) \ +do { \ + long GOTO__ret = (long)(rc); \ + CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \ + #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\ + (signed long)GOTO__ret); \ + goto label; \ +} while (0) + +#define RETURN(rc) \ +do { \ + typeof(rc) RETURN__ret = (rc); \ + long tmp = (long)RETURN__ret; \ + CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ + (unsigned long)tmp, (signed long)tmp, \ + (signed long)tmp); \ + return RETURN__ret; \ +} while (0) + +#define ENTRY \ +do { \ + CDEBUG(D_TRACE, "Process entered\n"); \ +} while (0) + +#define EXIT \ +do { \ + CDEBUG(D_TRACE, "Process leaving\n"); \ +} while(0) + + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define schedule_work schedule_task +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_TQUEUE((wq), 0, 0); \ + PREPARE_TQUEUE((wq), (cb), (cbdata)); \ +} while (0) + +#define ll_invalidate_inode_pages invalidate_inode_pages +#define PageUptodate Page_Uptodate +#define our_recalc_sigpending(current) recalc_sigpending(current) +#define num_online_cpus() smp_num_cpus +static inline void our_cond_resched(void) +{ + if (current->need_resched) + schedule (); +} + +#else + +#define 
prepare_work(wq,cb,cbdata) \ +do { \ + INIT_WORK((wq), (void *)(cb), (void *)(cbdata)); \ +} while (0) +#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping) +#define wait_on_page wait_on_page_locked +#define our_recalc_sigpending(current) recalc_sigpending() +#define strtok(a,b) strpbrk(a, b) +static inline void our_cond_resched(void) +{ + cond_resched(); +} +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */ + +#ifdef PORTAL_DEBUG +extern void kportal_assertion_failed(char *expr,char *file,char *func,int line); +#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \ + __FUNCTION__, __LINE__)) +#else +#define LASSERT(e) +#endif + +#ifdef __arch_um__ +#define LBUG() \ +do { \ + CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__); \ + panic("LBUG"); \ +} while (0) +#else +#define LBUG() \ +do { \ + CEMERG("LBUG\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__); \ + set_task_state(current, TASK_UNINTERRUPTIBLE); \ + schedule(); \ +} while (0) +#endif /* __arch_um__ */ + +/* + * Memory + */ +#ifdef PORTAL_DEBUG +extern atomic_t portal_kmemory; + +# define portal_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &portal_kmemory); \ +} while (0) + +# define portal_kmem_dec(ptr, size) do { \ + atomic_sub(size, &portal_kmemory); \ +} while (0) + +#else +# define portal_kmem_inc(ptr, size) do {} while (0) +# define portal_kmem_dec(ptr, size) do {} while (0) +#endif /* PORTAL_DEBUG */ + +#define PORTAL_VMALLOC_SIZE 16384 + +#define PORTAL_ALLOC(ptr, size) \ +do { \ + long s = size; \ + LASSERT (!in_interrupt()); \ + if (s > PORTAL_VMALLOC_SIZE) \ + (ptr) = vmalloc(s); \ + else \ + (ptr) = kmalloc(s, GFP_KERNEL); \ + if ((ptr) == NULL) \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s); \ + else { \ + portal_kmem_inc((ptr), 
s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_FREE(ptr, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + if (s > PORTAL_VMALLOC_SIZE) \ + vfree(ptr); \ + else \ + kfree(ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_ALLOC(ptr, slab, size) \ +do { \ + long s = (size); \ + LASSERT (!in_interrupt()); \ + (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' from slab '" #slab "')\n", __FILE__, \ + __LINE__); \ + } else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_FREE(ptr, slab, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + memset((ptr), 0x5a, s); \ + kmem_cache_free((slab), ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +/* ------------------------------------------------------------------- */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + +#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x) +#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x) + +#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x)) +#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) + +#define PORTAL_MODULE_USE MOD_INC_USE_COUNT +#define 
PORTAL_MODULE_UNUSE MOD_DEC_USE_COUNT +#else + +#define PORTAL_SYMBOL_REGISTER(x) +#define PORTAL_SYMBOL_UNREGISTER(x) + +#define PORTAL_SYMBOL_GET(x) symbol_get(x) +#define PORTAL_SYMBOL_PUT(x) symbol_put(x) + +#define PORTAL_MODULE_USE try_module_get(THIS_MODULE) +#define PORTAL_MODULE_UNUSE module_put(THIS_MODULE) + +#endif + +/******************************************************************************/ +/* Kernel Portals Router interface */ + +typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback + +/* space for routing targets to stash "stuff" in a forwarded packet */ +typedef union { + long long _alignment; + void *_space[16]; /* scale with CPU arch */ +} kprfd_scratch_t; + +/* Kernel Portals Routing Forwarded message Descriptor */ +typedef struct { + struct list_head kprfd_list; /* stash in queues (routing target can use) */ + ptl_nid_t kprfd_target_nid; /* final destination NID */ + ptl_nid_t kprfd_gateway_nid; /* gateway NID */ + int kprfd_nob; /* # message bytes (including header) */ + int kprfd_niov; /* # message frags (including header) */ + struct iovec *kprfd_iov; /* message fragments */ + void *kprfd_router_arg; // originating NAL's router arg + kpr_fwd_callback_t kprfd_callback; /* completion callback */ + void *kprfd_callback_arg; /* completion callback arg */ + kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets +} kpr_fwd_desc_t; + +typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); + +/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ +typedef const struct { + int kprni_nalid; /* NAL's id */ + void *kprni_arg; /* Arg to pass when calling into NAL */ + kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ +} kpr_nal_interface_t; + +/* Router's routing interface (Kernel Portals Routing Router Interface) */ +typedef const struct { + /* register the calling NAL with the router and get back the handle for + * subsequent calls */ + int (*kprri_register) (kpr_nal_interface_t 
*nal_interface, + void **router_arg); + + /* ask the router to find a gateway that forwards to 'nid' and is a peer + * of the calling NAL */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + ptl_nid_t *gateway_nid); + + /* hand a packet over to the router for forwarding */ + kpr_fwd_t kprri_fwd_start; + + /* hand a packet back to the router for completion */ + void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, + int error); + + /* the calling NAL is shutting down */ + void (*kprri_shutdown) (void *router_arg); + + /* deregister the calling NAL with the router */ + void (*kprri_deregister) (void *router_arg); + +} kpr_router_interface_t; + +/* Convenient struct for NAL to stash router interface/args */ +typedef struct { + kpr_router_interface_t *kpr_interface; + void *kpr_arg; +} kpr_router_t; + +/* Router's control interface (Kernel Portals Routing Control Interface) */ +typedef const struct { + int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); + int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_get_route)(int index, int *gateway_nal, + ptl_nid_t *gateway, ptl_nid_t *lo_nid, + ptl_nid_t *hi_nid); +} kpr_control_interface_t; + +extern kpr_control_interface_t kpr_control_interface; +extern kpr_router_interface_t kpr_router_interface; + +static inline int +kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) +{ + int rc; + + router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); + if (router->kpr_interface == NULL) + return (-ENOENT); + + rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); + if (rc != 0) + router->kpr_interface = NULL; + + PORTAL_SYMBOL_PUT (kpr_router_interface); + return (rc); +} + +static inline int +kpr_routing (kpr_router_t *router) +{ + return (router->kpr_interface != NULL); +} + +static inline int +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +{ + if (!kpr_routing (router)) + return (-EHOSTUNREACH); + 
+ return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + gateway_nid)); +} + +static inline void +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, + int nob, int niov, struct iovec *iov, + kpr_fwd_callback_t callback, void *callback_arg) +{ + fwd->kprfd_target_nid = nid; + fwd->kprfd_gateway_nid = nid; + fwd->kprfd_nob = nob; + fwd->kprfd_niov = niov; + fwd->kprfd_iov = iov; + fwd->kprfd_callback = callback; + fwd->kprfd_callback_arg = callback_arg; +} + +static inline void +kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) +{ + if (!kpr_routing (router)) + fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + else + router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); +} + +static inline void +kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) +{ + LASSERT (kpr_routing (router)); + router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); +} + +static inline void +kpr_shutdown (kpr_router_t *router) +{ + if (kpr_routing (router)) + router->kpr_interface->kprri_shutdown (router->kpr_arg); +} + +static inline void +kpr_deregister (kpr_router_t *router) +{ + if (!kpr_routing (router)) + return; + router->kpr_interface->kprri_deregister (router->kpr_arg); + router->kpr_interface = NULL; +} + +/******************************************************************************/ + +#ifdef PORTALS_PROFILING +#define prof_enum(FOO) PROF__##FOO +enum { + prof_enum(our_recvmsg), + prof_enum(our_sendmsg), + prof_enum(socknal_recv), + prof_enum(lib_parse), + prof_enum(conn_list_walk), + prof_enum(memcpy), + prof_enum(lib_finalize), + prof_enum(pingcli_time), + prof_enum(gmnal_send), + prof_enum(gmnal_recv), + MAX_PROFS +}; + +struct prof_ent { + char *str; + /* hrmph. wrap-tastic. 
*/ + u32 starts; + u32 finishes; + cycles_t total_cycles; + cycles_t start; + cycles_t end; +}; + +extern struct prof_ent prof_ents[MAX_PROFS]; + +#define PROF_START(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->starts++; \ + pe->start = get_cycles(); \ + } while (0) + +#define PROF_FINISH(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->finishes++; \ + pe->end = get_cycles(); \ + pe->total_cycles += (pe->end - pe->start); \ + } while (0) +#else /* !PORTALS_PROFILING */ +#define PROF_START(FOO) do {} while(0) +#define PROF_FINISH(FOO) do {} while(0) +#endif /* PORTALS_PROFILING */ + +/* debug.c */ +void portals_run_lbug_upcall(char * file, char *fn, int line); +void portals_debug_dumplog(void); +int portals_debug_init(unsigned long bufsize); +int portals_debug_cleanup(void); +int portals_debug_clear_buffer(void); +int portals_debug_mark_buffer(char *text); +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *file, unsigned int size); +__s32 portals_debug_copy_to_user(char *buf, unsigned long len); +#if (__GNUC__) +/* Use the special GNU C __attribute__ hack to have the compiler check the + * printf style argument string against the actual argument count and + * types. + */ +#ifdef printf +# warning printf has been defined as a macro... +# undef printf +#endif +void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) + __attribute__ ((format (printf, 7, 8))); +#else +void portals_debug_msg (int subsys, int mask, char *file, char *fn, + int line, unsigned long stack, + const char *format, ...); +#endif /* __GNUC__ */ +void portals_debug_set_level(unsigned int debug_level); + +# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) +# define printf(format, b...) 
CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+extern void kportal_daemonize (char *name);
+extern void kportal_blockallsigs (void);
+
+#else /* !__KERNEL__ */
+# include
+# include
+#ifndef __CYGWIN__
+# include
+#endif
+# include
+# include
+# include
+# ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+# endif
+# ifdef PORTAL_DEBUG
+# undef NDEBUG
+# include
+# define LASSERT(e) assert(e)
+# else
+# define LASSERT(e)
+# endif
+# define printk(format, args...) printf (format, ## args)
+/*
+ * User-space stand-ins for the kernel allocation and logging entry points.
+ * None of them carries a trailing ';' so each expands to exactly one
+ * statement and stays usable as the body of an unbraced if/else.
+ * PORTAL_ALLOC hands back zero-filled memory, as the kernel variant does
+ * with its memset(); 'ptr' is NULL on failure.  PORTAL_FREE ignores its
+ * size argument.
+ */
+# define PORTAL_ALLOC(ptr, size) do { (ptr) = calloc(1, (size)); } while (0)
+# define PORTAL_FREE(a, b) do { free(a); } while (0)
+# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
+    printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
+            (subsys) >> 24, (mask), (unsigned long)time(0), file, fn, line,    \
+            getpid() , stack, ## a)
+#endif
+
+#ifndef CURRENT_TIME
+# define CURRENT_TIME time(0)
+#endif
+
+#include
+
+/*
+ * USER LEVEL STUFF BELOW
+ */
+
+#define PORTAL_IOCTL_VERSION 0x00010007
+#define PING_SYNC       0
+#define PING_ASYNC      1
+
+/*
+ * Argument block exchanged through the portals ioctls.  The two inline
+ * buffers are packed, each rounded with size_round(), into ioc_bulk directly
+ * behind the fixed part; ioc_len is the total packed length.
+ */
+struct portal_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+        __u64 ioc_nid;
+        __u64 ioc_nid2;
+        __u64 ioc_nid3;
+        __u32 ioc_count;
+        __u32 ioc_nal;
+        __u32 ioc_nal_cmd;
+        __u32 ioc_fd;
+        __u32 ioc_id;
+
+        __u32 ioc_flags;
+        __u32 ioc_size;
+
+        __u32 ioc_wait;
+        __u32 ioc_timeout;
+        __u32 ioc_misc;
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
+
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+/* Leading fields shared by every portals ioctl argument block. */
+struct portal_ioctl_hdr {
+        __u32 ioc_len;
+        __u32 ioc_version;
+};
+
+struct portals_debug_ioctl_data
+{
+        struct portal_ioctl_hdr hdr;
+        unsigned int subs;
+        unsigned int debug;
+};
+
+/* Zero 'data' (an lvalue, not a pointer) and stamp its version and length. */
+#define PORTAL_IOC_INIT(data)                           \
+do {                                                    \
+        memset(&(data), 0, sizeof(data));               \
+        (data).ioc_version = PORTAL_IOCTL_VERSION;      \
+        (data).ioc_len = sizeof(data);                  \
+} while (0)
+
+/* 
FIXME check conflict with lustre_lib.h */ +#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) + +static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) +{ + int len = sizeof(*data); + len += size_round(data->ioc_inllen1); + len += size_round(data->ioc_inllen2); + return len; +} + +static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if (portal_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("PORTALS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + +#ifndef __KERNEL__ +static inline int portal_ioctl_pack(struct 
portal_ioctl_data *data, char **pbuf,
+                                    int max)
+{
+        char *ptr;
+        struct portal_ioctl_data *overlay;
+        data->ioc_len = portal_ioctl_packlen(data);
+        data->ioc_version = PORTAL_IOCTL_VERSION;
+
+        if (*pbuf && portal_ioctl_packlen(data) > max)
+                return 1;
+        if (*pbuf == NULL) {
+                *pbuf = malloc(data->ioc_len);
+        }
+        if (!*pbuf)
+                return 1;
+        overlay = (struct portal_ioctl_data *)*pbuf;
+        memcpy(*pbuf, data, sizeof(*data));
+
+        ptr = overlay->ioc_bulk;
+        if (data->ioc_inlbuf1)
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        if (data->ioc_inlbuf2)
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        if (portal_ioctl_is_invalid(overlay))
+                return 1;
+
+        return 0;
+}
+#else
+#include
+
+/*
+ * Copy the ioctl argument block at user address 'arg' into the kernel buffer
+ * [buf, end), validate it with portal_ioctl_is_invalid() and point
+ * ioc_inlbuf1/ioc_inlbuf2 at their packed data inside ioc_bulk.
+ *
+ * Returns 0 on success, -EFAULT if the user memory cannot be read, and
+ * -EINVAL for a version mismatch, a length that does not fit the kernel
+ * buffer or is shorter than struct portal_ioctl_data, or a malformed block.
+ *
+ * buffer MUST be at least the size of portal_ioctl_hdr
+ */
+static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct portal_ioctl_hdr *hdr;
+        struct portal_ioctl_data *data;
+        ENTRY;
+
+        hdr = (struct portal_ioctl_hdr *)buf;
+        data = (struct portal_ioctl_data *)buf;
+
+        /* copy_from_user() reports the number of bytes it could NOT copy,
+         * which is not an errno; translate any shortfall into -EFAULT */
+        if (copy_from_user(buf, (void *)arg, sizeof(*hdr))) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+                CERROR ("PORTALS: version mismatch kernel vs application\n");
+                return -EINVAL;
+        }
+
+        /* ioc_len comes from user space: compare it with the room left in
+         * the buffer rather than forming buf + ioc_len, which can wrap */
+        if (end <= buf || hdr->ioc_len >= (unsigned long)(end - buf)) {
+                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
+                return -EINVAL;
+        }
+
+
+        if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
+                CERROR ("PORTALS: user buffer too small for ioctl\n");
+                return -EINVAL;
+        }
+
+        if (copy_from_user(buf, (void *)arg, hdr->ioc_len)) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        if (portal_ioctl_is_invalid(data)) {
+                CERROR ("PORTALS: ioctl not correctly formatted\n");
+                return -EINVAL;
+        }
+
+        if (data->ioc_inllen1) {
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+        }
+
+        if (data->ioc_inllen2) {
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+        }
+
+        EXIT;
+        return 0;
+}
+#endif
+
+/* ioctls for manipulating snapshots 30- */
+#define 
IOC_PORTAL_TYPE 'e' +#define IOC_PORTAL_MIN_NR 30 + +#define IOC_PORTAL_PING _IOWR('e', 30, long) +#define IOC_PORTAL_GET_DEBUG _IOWR('e', 31, long) +#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long) +#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long) +#define IOC_PORTAL_PANIC _IOWR('e', 34, long) +#define IOC_PORTAL_ADD_ROUTE _IOWR('e', 35, long) +#define IOC_PORTAL_DEL_ROUTE _IOWR('e', 36, long) +#define IOC_PORTAL_GET_ROUTE _IOWR('e', 37, long) +#define IOC_PORTAL_NAL_CMD _IOWR('e', 38, long) +#define IOC_PORTAL_GET_NID _IOWR('e', 39, long) +#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) +#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) + +#define IOC_PORTAL_MAX_NR 41 + +enum { + QSWNAL = 1, + SOCKNAL, + GMNAL, + TOENAL, + TCPNAL, + SCIMACNAL, + NAL_ENUM_END_MARKER +}; + +#ifdef __KERNEL__ +extern ptl_handle_ni_t kqswnal_ni; +extern ptl_handle_ni_t ksocknal_ni; +extern ptl_handle_ni_t ktoenal_ni; +extern ptl_handle_ni_t kgmnal_ni; +extern ptl_handle_ni_t kscimacnal_ni; +#endif + +#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) + +#define NAL_CMD_REGISTER_PEER_FD 100 +#define NAL_CMD_CLOSE_CONNECTION 101 +#define NAL_CMD_REGISTER_MYNID 102 +#define NAL_CMD_PUSH_CONNECTION 103 + +enum { + DEBUG_DAEMON_START = 1, + DEBUG_DAEMON_STOP = 2, + DEBUG_DAEMON_PAUSE = 3, + DEBUG_DAEMON_CONTINUE = 4, +}; + +/* XXX remove to lustre ASAP */ +struct lustre_peer { + ptl_nid_t peer_nid; + ptl_handle_ni_t peer_ni; +}; + +/* module.c */ +typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private); +int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private); +int kportal_nal_unregister(int nal); + +ptl_handle_ni_t *kportal_get_ni (int nal); +void kportal_put_ni (int nal); + +#ifdef __CYGWIN__ +#ifndef BITS_PER_LONG +#if (~0UL) == 0xffffffffUL +#define BITS_PER_LONG 32 +#else +#define BITS_PER_LONG 64 +#endif +#endif +#endif + +#if (BITS_PER_LONG == 32 || __WORDSIZE == 32) +# define LPU64 "%Lu" +# define LPD64 "%Ld" +# define LPX64 "%#Lx" +# 
define LPSZ "%u" +# define LPSSZ "%d" +#endif +#if (BITS_PER_LONG == 64 || __WORDSIZE == 64) +# define LPU64 "%lu" +# define LPD64 "%ld" +# define LPX64 "%#lx" +# define LPSZ "%lu" +# define LPSSZ "%ld" +#endif +#ifndef LPU64 +# error "No word size defined" +#endif + +#endif diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h new file mode 100644 index 0000000..a528a80 --- /dev/null +++ b/lustre/portals/include/linux/portals_lib.h @@ -0,0 +1,188 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef _PORTALS_LIB_H +#define _PORTALS_LIB_H + +#ifndef __KERNEL__ +# include +#else +# include +#endif + +#undef MIN +#define MIN(a,b) (((a)<(b)) ? (a): (b)) +#undef MAX +#define MAX(a,b) (((a)>(b)) ? (a): (b)) +#define MKSTR(ptr) ((ptr))? 
(ptr) : "" + +static inline int size_round (int val) +{ + return (val + 7) & (~0x7); +} + +static inline int size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t round_strlen(char *fset) +{ + return size_round(strlen(fset) + 1); +} + +#ifdef __KERNEL__ +static inline char *strdup(const char *str) +{ + int len = strlen(str) + 1; + char *tmp = kmalloc(len, GFP_KERNEL); + if (tmp) + memcpy(tmp, str, len); + + return tmp; +} +#endif + +#ifdef __KERNEL__ +# define NTOH__u32(var) le32_to_cpu(var) +# define NTOH__u64(var) le64_to_cpu(var) +# define HTON__u32(var) cpu_to_le32(var) +# define HTON__u64(var) cpu_to_le64(var) +#else +# define expansion_u64(var) \ + ({ __u64 ret; \ + switch (sizeof(var)) { \ + case 8: (ret) = (var); break; \ + case 4: (ret) = (__u32)(var); break; \ + case 2: (ret) = (__u16)(var); break; \ + case 1: (ret) = (__u8)(var); break; \ + }; \ + (ret); \ + }) +# define NTOH__u32(var) (var) +# define NTOH__u64(var) (expansion_u64(var)) +# define HTON__u32(var) (var) +# define HTON__u64(var) (expansion_u64(var)) +#endif + +/* + * copy sizeof(type) bytes from pointer to var and move ptr forward. 
+ * return EFAULT if pointer goes beyond end
+ *
+ * The bounds test runs before the load, so a truncated buffer is never
+ * read past 'end'.  Like every UNLOG* macro this one executes 'return'
+ * in the calling function on failure.
+ */
+#define UNLOGV(var,type,ptr,end)                        \
+do {                                                    \
+        if ((ptr) + sizeof(type) > (end))               \
+                return -EFAULT;                         \
+        var = *(type *)(ptr);                           \
+        ptr += sizeof(type);                            \
+} while (0)
+
+/* the following two macros convert to little endian */
+/* type MUST be __u32 or __u64 */
+#define LUNLOGV(var,type,ptr,end)                       \
+do {                                                    \
+        if ((ptr) + sizeof(type) > (end))               \
+                return -EFAULT;                         \
+        var = NTOH##type(*(type *)(ptr));               \
+        ptr += sizeof(type);                            \
+} while (0)
+
+/* now log values */
+#define LOGV(var,type,ptr)                              \
+do {                                                    \
+        *((type *)ptr) = var;                           \
+        ptr += sizeof(type);                            \
+} while (0)
+
+/* and in network order */
+#define LLOGV(var,type,ptr)                             \
+do {                                                    \
+        *((type *)ptr) = HTON##type(var);               \
+        ptr += sizeof(type);                            \
+} while (0)
+
+
+/*
+ * set var to point at (type *)ptr, move ptr forward with sizeof(type)
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGP(var,type,ptr,end)                        \
+do {                                                    \
+        var = (type *)ptr;                              \
+        ptr += sizeof(type);                            \
+        if (ptr > end )                                 \
+                return -EFAULT;                         \
+} while (0)
+
+/* copy sizeof(type) bytes from var to ptr and move ptr forward */
+#define LOGP(var,type,ptr)                              \
+do {                                                    \
+        memcpy(ptr, var, sizeof(type));                 \
+        ptr += sizeof(type);                            \
+} while (0)
+
+/*
+ * set var to point at (char *)ptr, move ptr forward by size_round(len);
+ * return from function with EFAULT if ptr goes beyond end
+ *
+ * 'len' is parenthesised so that an expression argument is scaled as a
+ * whole by sizeof(type).
+ */
+#define UNLOGL(var,type,len,ptr,end)                    \
+do {                                                    \
+        var = (type *)ptr;                              \
+        ptr += size_round((len) * sizeof(type));        \
+        if (ptr > end )                                 \
+                return -EFAULT;                         \
+} while (0)
+
+/*
+ * UNLOGL, then fail with EFAULT unless the last of the 'len' bytes is NUL.
+ * NOTE(review): the back-offset uses size_round(len) while UNLOGL advanced by
+ * size_round(len * sizeof(type)); the two agree only when sizeof(type) == 1,
+ * and len == 0 inspects the byte before the field - confirm against callers.
+ */
+#define UNLOGL0(var,type,len,ptr,end)                           \
+do {                                                            \
+        UNLOGL(var,type,len,ptr,end);                           \
+        if ( *((char *)ptr - size_round(len) + len - 1) != '\0')\
+                return -EFAULT;                                 \
+} while (0)
+
+/* copy len bytes from var (skipped when var is NULL) to ptr, then advance
+ * ptr by size_round(len) */
+#define LOGL(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)ptr, (const char *)var, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+/* inverse of LOGL: copy len bytes from ptr into var (skipped when var is
+ * NULL), then advance ptr by size_round(len) */
+#define LOGU(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)var, (const char *)ptr, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {                                                    \
+        if (!len)                                       \
+                break;                                  \
+        memcpy((char *)ptr, (const char 
*)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += size_round(len + 1); \ +} while (0) + +#endif /* _PORTALS_LIB_H */ diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am new file mode 100644 index 0000000..c61b084 --- /dev/null +++ b/lustre/portals/include/portals/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = base +include $(top_srcdir)/Rules + +pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h + diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h new file mode 100644 index 0000000..af4a2dc --- /dev/null +++ b/lustre/portals/include/portals/api-support.h @@ -0,0 +1,27 @@ +# define DEBUG_SUBSYSTEM S_PORTALS +# define PORTAL_DEBUG + +#ifndef __KERNEL__ +# include +# include +# include +# include + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include +# include +# include +#endif + +#include +#include +#include + +#include +#include +#include + +/* Hack for 2.4.18 macro name collision */ +#ifdef yield +#undef yield +#endif diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h new file mode 100644 index 0000000..a83749b --- /dev/null +++ b/lustre/portals/include/portals/api.h @@ -0,0 +1,159 @@ +#ifndef P30_API_H +#define P30_API_H + +#include + +#ifndef PTL_NO_WRAP +int PtlInit(void); +int PtlInitialized(void); +void PtlFini(void); + +int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in, + ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid, + ptl_handle_ni_t * interface_out); + +int PtlNIInitialized(ptl_interface_t); + +int PtlNIFini(ptl_handle_ni_t 
interface_in); + +#endif + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); + + +/* + * Network interfaces + */ + +#ifndef PTL_NO_WRAP +int PtlNIBarrier(ptl_handle_ni_t interface_in); +#endif + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out); + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out); + +#ifndef PTL_NO_WRAP +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); +#endif + + +/* + * PtlNIDebug: + * + * This is not an official Portals 3 API call. It is provided + * by the reference implementation to allow the maintainers an + * easy way to turn on and off debugging information in the + * library. Do not use it in code that is intended for use + * with any version other than the portable reference library. + */ +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); + +/* + * PtlFailNid + * + * Not an official Portals 3 API call. It provides a way of simulating + * communications failures to all (nid == PTL_NID_ANY), or specific peers + * (via multiple calls), either until further notice (threshold == -1), or + * for a specific number of messages. Passing a threshold of zero "heals" + * the given peer.
+ */ +int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); + + +/* + * Match entries + */ + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); + +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out); + +int PtlMEUnlink(ptl_handle_me_t current_in); + +int PtlMEUnlinkList(ptl_handle_me_t current_in); + +int PtlTblDump(ptl_handle_ni_t ni, int index_in); +int PtlMEDump(ptl_handle_me_t current_in); + + + +/* + * Memory descriptors + */ + +#ifndef PTL_NO_WRAP +int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out); + +int PtlMDUnlink(ptl_handle_md_t md_in); + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in); + +#endif + +/* These should not be called by users */ +int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in, + ptl_seq_t sequence_in); + + + + +/* + * Event queues + */ +#ifndef PTL_NO_WRAP + +/* These should be called by users */ +int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out); +int PtlEQFree(ptl_handle_eq_t eventq_in); + +int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); + +int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout); 
+#endif + +/* + * Access Control Table + */ +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); + + +/* + * Data movement + */ + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in); + + + +#endif diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h new file mode 100644 index 0000000..3c3b154 --- /dev/null +++ b/lustre/portals/include/portals/arg-blocks.h @@ -0,0 +1,265 @@ +#ifndef PTL_BLOCKS_H +#define PTL_BLOCKS_H + +/* + * blocks.h + * + * Argument block types for the Portals 3.0 library + * Generated by idl + * + */ + +#include + +/* put LIB_MAX_DISPATCH last here -- these must match the + assignements to the dispatch table in lib-p30/dispatch.c */ +#define PTL_GETID 1 +#define PTL_NISTATUS 2 +#define PTL_NIDIST 3 +#define PTL_NIDEBUG 4 +#define PTL_MEATTACH 5 +#define PTL_MEINSERT 6 +// #define PTL_MEPREPEND 7 +#define PTL_MEUNLINK 8 +#define PTL_TBLDUMP 9 +#define PTL_MEDUMP 10 +#define PTL_MDATTACH 11 +// #define PTL_MDINSERT 12 +#define PTL_MDBIND 13 +#define PTL_MDUPDATE 14 +#define PTL_MDUNLINK 15 +#define PTL_EQALLOC 16 +#define PTL_EQFREE 17 +#define PTL_ACENTRY 18 +#define PTL_PUT 19 +#define PTL_GET 20 +#define PTL_FAILNID 21 +#define LIB_MAX_DISPATCH 21 + +typedef struct PtlFailNid_in { + ptl_handle_ni_t interface; + ptl_nid_t nid; + unsigned int threshold; +} PtlFailNid_in; + +typedef struct PtlFailNid_out { + int rc; +} PtlFailNid_out; + +typedef struct PtlGetId_in { + ptl_handle_ni_t handle_in; +} PtlGetId_in; + +typedef struct PtlGetId_out { + int rc; + ptl_process_id_t id_out; +} 
PtlGetId_out; + +typedef struct PtlNIStatus_in { + ptl_handle_ni_t interface_in; + ptl_sr_index_t register_in; +} PtlNIStatus_in; + +typedef struct PtlNIStatus_out { + int rc; + ptl_sr_value_t status_out; +} PtlNIStatus_out; + + +typedef struct PtlNIDist_in { + ptl_handle_ni_t interface_in; + ptl_process_id_t process_in; +} PtlNIDist_in; + +typedef struct PtlNIDist_out { + int rc; + unsigned long distance_out; +} PtlNIDist_out; + + +typedef struct PtlNIDebug_in { + unsigned int mask_in; +} PtlNIDebug_in; + +typedef struct PtlNIDebug_out { + unsigned int rc; +} PtlNIDebug_out; + + +typedef struct PtlMEAttach_in { + ptl_handle_ni_t interface_in; + ptl_pt_index_t index_in; + ptl_ins_pos_t position_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; +} PtlMEAttach_in; + +typedef struct PtlMEAttach_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEAttach_out; + + +typedef struct PtlMEInsert_in { + ptl_handle_me_t current_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; + ptl_ins_pos_t position_in; +} PtlMEInsert_in; + +typedef struct PtlMEInsert_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEInsert_out; + +typedef struct PtlMEUnlink_in { + ptl_handle_me_t current_in; + ptl_unlink_t unlink_in; +} PtlMEUnlink_in; + +typedef struct PtlMEUnlink_out { + int rc; +} PtlMEUnlink_out; + + +typedef struct PtlTblDump_in { + int index_in; +} PtlTblDump_in; + +typedef struct PtlTblDump_out { + int rc; +} PtlTblDump_out; + + +typedef struct PtlMEDump_in { + ptl_handle_me_t current_in; +} PtlMEDump_in; + +typedef struct PtlMEDump_out { + int rc; +} PtlMEDump_out; + + +typedef struct PtlMDAttach_in { + ptl_handle_me_t me_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; + ptl_unlink_t unlink_in; +} PtlMDAttach_in; + +typedef struct PtlMDAttach_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDAttach_out; + + 
+typedef struct PtlMDBind_in { + ptl_handle_ni_t ni_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; +} PtlMDBind_in; + +typedef struct PtlMDBind_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDBind_out; + + +typedef struct PtlMDUpdate_internal_in { + ptl_handle_md_t md_in; + ptl_handle_eq_t testq_in; + ptl_seq_t sequence_in; + + ptl_md_t old_inout; + int old_inout_valid; + ptl_md_t new_inout; + int new_inout_valid; +} PtlMDUpdate_internal_in; + +typedef struct PtlMDUpdate_internal_out { + int rc; + ptl_md_t old_inout; + ptl_md_t new_inout; +} PtlMDUpdate_internal_out; + + +typedef struct PtlMDUnlink_in { + ptl_handle_md_t md_in; +} PtlMDUnlink_in; + +typedef struct PtlMDUnlink_out { + int rc; + ptl_md_t status_out; +} PtlMDUnlink_out; + + +typedef struct PtlEQAlloc_in { + ptl_handle_ni_t ni_in; + ptl_size_t count_in; + void *base_in; + int len_in; + int (*callback_in) (ptl_event_t * event); +} PtlEQAlloc_in; + +typedef struct PtlEQAlloc_out { + int rc; + ptl_handle_eq_t handle_out; +} PtlEQAlloc_out; + + +typedef struct PtlEQFree_in { + ptl_handle_eq_t eventq_in; +} PtlEQFree_in; + +typedef struct PtlEQFree_out { + int rc; +} PtlEQFree_out; + + +typedef struct PtlACEntry_in { + ptl_handle_ni_t ni_in; + ptl_ac_index_t index_in; + ptl_process_id_t match_id_in; + ptl_pt_index_t portal_in; +} PtlACEntry_in; + +typedef struct PtlACEntry_out { + int rc; +} PtlACEntry_out; + + +typedef struct PtlPut_in { + ptl_handle_md_t md_in; + ptl_ack_req_t ack_req_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; + ptl_hdr_data_t hdr_data_in; +} PtlPut_in; + +typedef struct PtlPut_out { + int rc; +} PtlPut_out; + + +typedef struct PtlGet_in { + ptl_handle_md_t md_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; +} PtlGet_in; + +typedef struct PtlGet_out { + int rc; +} PtlGet_out; + + 
+#endif diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h new file mode 100644 index 0000000..285f7e0 --- /dev/null +++ b/lustre/portals/include/portals/defines.h @@ -0,0 +1,117 @@ +/* +** $Id: defines.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $ +** +** This files contains definitions that are used throughout the cplant code. +*/ + +#ifndef CPLANT_H +#define CPLANT_H + +#define TITLE(fname,zmig) + + +/* +** TRUE and FALSE +*/ +#undef TRUE +#define TRUE (1) +#undef FALSE +#define FALSE (0) + + +/* +** Return codes from functions +*/ +#undef OK +#define OK (0) +#undef ERROR +#define ERROR (-1) + + + +/* +** The GCC macro for a safe max() that works on all types arithmetic types. +*/ +#ifndef MAX +#define MAX(a, b) (a) > (b) ? (a) : (b) +#endif /* MAX */ + +#ifndef MIN +#define MIN(a, b) (a) < (b) ? (a) : (b) +#endif /* MIN */ + +/* +** The rest is from the old qkdefs.h +*/ + +#ifndef __linux__ +#define __inline__ +#endif + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef __osf__ +#define PRIVATE static +#define PUBLIC +#endif + +#ifndef __osf__ +typedef unsigned char uchar; +#endif + +typedef char CHAR; +typedef unsigned char UCHAR; +typedef char INT8; +typedef unsigned char UINT8; +typedef short int INT16; +typedef unsigned short int UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long LONG32; +typedef unsigned long ULONG32; + +/* long may be 32 or 64, so we can't really append the size to the definition */ +typedef long LONG; +typedef unsigned long ULONG; + +#ifdef __alpha__ +typedef long int_t; +#ifndef __osf__ +typedef unsigned long uint_t; +#endif +#endif + +#ifdef __i386__ +typedef int int_t; +typedef unsigned int uint_t; +#endif + +typedef float FLOAT32; +typedef double FLOAT64; +typedef void VOID; +typedef INT32 BOOLEAN; +typedef void (*FCN_PTR)(void); + +#ifndef off64_t + +#if defined (__alpha__) || defined (__ia64__) +typedef long off64_t; +#else +typedef long long off64_t; +#endif 
+ +#endif + +/* +** Process related typedefs +*/ +typedef UINT16 PID_TYPE; /* Type of Local process ID */ +typedef UINT16 NID_TYPE; /* Type of Physical node ID */ +typedef UINT16 GID_TYPE; /* Type of Group ID */ +typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */ + + + +#endif /* CPLANT_H */ diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h new file mode 100644 index 0000000..817936a --- /dev/null +++ b/lustre/portals/include/portals/errno.h @@ -0,0 +1,61 @@ +#ifndef _P30_ERRNO_H_ +#define _P30_ERRNO_H_ + +/* + * include/portals/errno.h + * + * Shared error number lists + */ + +/* If you change these, you must update the string table in api-errno.c */ +typedef enum { + PTL_OK = 0, + PTL_SEGV = 1, + + PTL_NOSPACE = 2, + PTL_INUSE = 3, + PTL_VAL_FAILED = 4, + + PTL_NAL_FAILED = 5, + PTL_NOINIT = 6, + PTL_INIT_DUP = 7, + PTL_INIT_INV = 8, + PTL_AC_INV_INDEX = 9, + + PTL_INV_ASIZE = 10, + PTL_INV_HANDLE = 11, + PTL_INV_MD = 12, + PTL_INV_ME = 13, + PTL_INV_NI = 14, +/* If you change these, you must update the string table in api-errno.c */ + PTL_ILL_MD = 15, + PTL_INV_PROC = 16, + PTL_INV_PSIZE = 17, + PTL_INV_PTINDEX = 18, + PTL_INV_REG = 19, + + PTL_INV_SR_INDX = 20, + PTL_ML_TOOLONG = 21, + PTL_ADDR_UNKNOWN = 22, + PTL_INV_EQ = 23, + PTL_EQ_DROPPED = 24, + + PTL_EQ_EMPTY = 25, + PTL_NOUPDATE = 26, + PTL_FAIL = 27, + PTL_NOT_IMPLEMENTED = 28, + PTL_NO_ACK = 29, + + PTL_IOV_TOO_MANY = 30, + PTL_IOV_TOO_SMALL = 31, + + PTL_EQ_INUSE = 32, + PTL_MD_INUSE = 33, + + PTL_MAX_ERRNO = 33 +} ptl_err_t; +/* If you change these, you must update the string table in api-errno.c */ + +extern const char *ptl_err_str[]; + +#endif diff --git a/lustre/portals/include/portals/internal.h b/lustre/portals/include/portals/internal.h new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h new file mode 100644 index 
0000000..7e5d73d --- /dev/null +++ b/lustre/portals/include/portals/lib-dispatch.h @@ -0,0 +1,46 @@ +#ifndef PTL_DISPATCH_H +#define PTL_DISPATCH_H + +/* + * include/dispatch.h + * + * Dispatch table header and externs for remote side + * operations + * + * Generated by idl + * + */ + +#include +#include + +extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlACEntry(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); + +extern char *dispatch_name(int 
index); +#endif diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h new file mode 100644 index 0000000..4052c0c --- /dev/null +++ b/lustre/portals/include/portals/lib-nal.h @@ -0,0 +1,102 @@ +#ifndef _LIB_NAL_H_ +#define _LIB_NAL_H_ + +/* + * nal.h + * + * Library side headers that define the abstraction layer's + * responsibilities and interfaces + */ + +#include + +struct nal_cb_t { + /* + * Per interface portal table, access control table + * and NAL private data field; + */ + lib_ni_t ni; + void *nal_data; + /* + * send: Sends a preformatted header and user data to a + * specified remote process. + * Can overwrite iov. + */ + int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, size_t mlen); + /* + * recv: Receives an incoming message from a remote process + * Type of iov depends on options. Can overwrite iov. 
+ */ + int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, size_t mlen, + size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, size_t mlen, + size_t rlen); + /* + * read: Reads a block of data from a specified user address + */ + int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); + + /* + * write: Writes a block of data into a specified user address + */ + int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, + void *src_addr, size_t len); + + /* + * callback: Calls an event callback + */ + int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); + + /* + * malloc: Acquire a block of memory in a system independent + * fashion. + */ + void *(*cb_malloc) (nal_cb_t * nal, size_t len); + + void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to cb_unmap() is what cb_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. 
+ */ + int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); + + /* Turn interrupts off (begin of protected area) */ + void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); + + /* Turn interrupts on (end of protected area) */ + void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); + + /* + * Calculate a network "distance" to given node + */ + int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); +}; + +#endif diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h new file mode 100644 index 0000000..ec3393b --- /dev/null +++ b/lustre/portals/include/portals/lib-p30.h @@ -0,0 +1,383 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + nal->cb_cli(nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + nal->cb_sti(nal, flagsp); \ +} while (0) +#else +/* not needed in user space until
we thread there */ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) +#endif /* __KERNEL__ */ + +#ifndef PTL_USE_SLAB_CACHE + +#define MAX_MES 2048 +#define MAX_MDS 2048 +#define MAX_MSGS 2048 /* Outstanding messages */ +#define MAX_EQS 512 + +extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); + +static inline void * +lib_freelist_alloc (lib_freelist_t *fl) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lib_freelist_free (lib_freelist_t *fl, void *obj) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_eq_t *eq; + + state_lock (nal, &flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); + state_unlock (nal, &flags); + + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_eqs, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_md_t *md; + + state_lock (nal, &flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called
with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL); + + if (md == NULL) + return (NULL); + + atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + 
kmem_cache_free(ptl_md_slab, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * +ptl_wire_handle2md 
(ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie); + + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie will contain the necessary information for + * finalizing the message. + * + * After it has finished the handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Call backs will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); + +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h new file mode 100644 index 0000000..08ea118 --- /dev/null +++ b/lustre/portals/include/portals/lib-types.h @@ -0,0 +1,273 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * p30/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef _LIB_TYPES_H_ +#define _LIB_TYPES_H_ + +#include +#ifdef __KERNEL__ +# define PTL_USE_SLAB_CACHE +# include +# include +# include +#else +# include +#endif + +/* struct nal_cb_t is defined in lib-nal.h */ +typedef struct nal_cb_t nal_cb_t; + +typedef char *user_ptr; +typedef struct lib_msg_t 
lib_msg_t; +typedef struct lib_ptl_t lib_ptl_t; +typedef struct lib_ac_t lib_ac_t; +typedef struct lib_me_t lib_me_t; +typedef struct lib_md_t lib_md_t; +typedef struct lib_eq_t lib_eq_t; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} ptl_handle_wire_t; + +/* byte-flip insensitive! */ +#define PTL_WIRE_HANDLE_NONE \ +((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) + +typedef enum { + PTL_MSG_ACK = 0, + PTL_MSG_PUT, + PTL_MSG_GET, + PTL_MSG_REPLY, + PTL_MSG_HELLO, +} ptl_msg_type_t; + +/* Each of these structs should start with an odd number of + * __u32, or the compiler could add its own padding and confuse + * everyone. + * + * Also, "length" needs to be at offset 28 of each struct. 
+ */ +typedef struct ptl_ack { + ptl_size_t mlength; + ptl_handle_wire_t dst_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for acks) moving out RSN */ +} ptl_ack_t; + +typedef struct ptl_put { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t ack_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length moving out RSN */ + ptl_size_t offset; + ptl_hdr_data_t hdr_data; +} ptl_put_t; + +typedef struct ptl_get { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t return_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for gets) moving out RSN */ + ptl_size_t src_offset; + ptl_size_t return_offset; /* unused: going RSN */ + ptl_size_t sink_length; +} ptl_get_t; + +typedef struct ptl_reply { + __u32 unused1; /* unused fields going RSN */ + ptl_handle_wire_t dst_wmd; + ptl_size_t dst_offset; /* unused: going RSN */ + __u32 unused2; + ptl_size_t length; /* common length moving out RSN */ +} ptl_reply_t; + +typedef struct { + ptl_nid_t dest_nid; + ptl_nid_t src_nid; + ptl_pid_t dest_pid; + ptl_pid_t src_pid; + __u32 type; /* ptl_msg_type_t */ + union { + ptl_ack_t ack; + ptl_put_t put; + ptl_get_t get; + ptl_reply_t reply; + } msg; +} ptl_hdr_t; + +/* All length fields in individual unions at same offset */ +/* LASSERT for same in lib-move.c */ +#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length) + +/* A HELLO message contains the portals magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * PTL_MSG_HELLO in the type field. All other fields are zero (including + * PTL_HDR_LENGTH; i.e. no payload). + * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID, so that hosts with + * multiple IP interfaces can have a single NID. These NALs should exchange + * HELLO messages when a connection is first established. 
*/ +typedef struct { + __u32 magic; /* PORTALS_PROTO_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} ptl_magicversion_t; + +#define PORTALS_PROTO_MAGIC 0xeebc0ded + +#define PORTALS_PROTO_VERSION_MAJOR 0 +#define PORTALS_PROTO_VERSION_MINOR 1 + +typedef struct { + long recv_count, recv_length, send_count, send_length, drop_count, + drop_length, msgs_alloc, msgs_max; +} lib_counters_t; + +/* temporary expedient: limit number of entries in discontiguous MDs */ +#if PTL_LARGE_MTU +# define PTL_MD_MAX_IOV 64 +#else +# define PTL_MD_MAX_IOV 16 +#endif + +struct lib_msg_t { + struct list_head msg_list; + int send_ack; + lib_md_t *md; + ptl_nid_t nid; + ptl_pid_t pid; + ptl_event_t ev; + ptl_handle_wire_t ack_wmd; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } msg_iov; +}; + +struct lib_ptl_t { + ptl_pt_index_t size; + struct list_head *tbl; +}; + +struct lib_ac_t { + int next_free; +}; + +typedef struct { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lib_handle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +struct lib_eq_t { + struct list_head eq_list; + lib_handle_t eq_lh; + ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + int eq_refcount; + int (*event_callback) (ptl_event_t * event); + void *eq_addrkey; +}; + +struct lib_me_t { + struct list_head me_list; + lib_handle_t me_lh; + ptl_process_id_t match_id; + ptl_match_bits_t match_bits, ignore_bits; + ptl_unlink_t unlink; + lib_md_t *md; +}; + +struct lib_md_t { + struct list_head md_list; + lib_handle_t md_lh; + lib_me_t *me; + user_ptr start; + ptl_size_t offset; + ptl_size_t length; + ptl_size_t max_size; + int threshold; + int pending; + ptl_unlink_t unlink; + unsigned int options; + unsigned int md_flags; + void *user_ptr; + lib_eq_t *eq; + void *md_addrkey; + unsigned int md_niov; /* # frags */ + union { 
+ struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } md_iov; +}; + +#define PTL_MD_FLAG_UNLINK (1 << 0) +#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) + +#ifndef PTL_USE_SLAB_CACHE +typedef struct +{ + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lib_freelist_t; + +typedef struct +{ + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lib_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* stash in ni.ni_test_peers */ + ptl_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lib_test_peer_t; + +typedef struct { + int up; + int refcnt; + ptl_nid_t nid; + ptl_pid_t pid; + int num_nodes; + unsigned int debug; + lib_ptl_t tbl; + lib_ac_t ac; + lib_counters_t counters; + + int ni_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ni_next_object_cookie; /* cookie generator */ + __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ + + struct list_head ni_test_peers; + +#ifndef PTL_USE_SLAB_CACHE + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; +} lib_ni_t; + +#endif diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h new file mode 100644 index 0000000..41613ab --- /dev/null +++ b/lustre/portals/include/portals/list.h @@ -0,0 +1,246 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + + +/* + * Simple doubly linked list implementation. 
+ * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
+ */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. 
+ */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#endif + +#ifndef list_for_each_entry +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. 
+ * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) +#endif + +#ifndef list_for_each_entry_safe +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: the &struct list_head to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = pos->member.next; \ + &pos->member != (head); \ + pos = list_entry(n, typeof(*pos), member), \ + n = pos->member.next) +#endif diff --git a/lustre/portals/include/portals/lltrace.h b/lustre/portals/include/portals/lltrace.h new file mode 100644 index 0000000..7d1b304 --- /dev/null +++ b/lustre/portals/include/portals/lltrace.h @@ -0,0 +1,175 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Compile with: + * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl + */ +#ifndef __LTRACE_H_ +#define __LTRACE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline int ltrace_write_file(char* fname) +{ + char* argv[3]; + + argv[0] = "debug_kernel"; + argv[1] = fname; + argv[2] = "1"; + + fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]); + + return jt_dbg_debug_kernel(3, argv); +} + +static inline int ltrace_clear() +{ + char* argv[1]; + + argv[0] = "clear"; + + fprintf(stderr, "[ptlctl] %s\n", argv[0]); + + return 
jt_dbg_clear_debug_buf(1, argv); +} + +static inline int ltrace_mark(int indent_level, char* text) +{ + char* argv[2]; + char mark_buf[PATH_MAX]; + + snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text); + + argv[0] = "mark"; + argv[1] = mark_buf; + return jt_dbg_mark_debug_buf(2, argv); +} + +static inline int ltrace_applymasks() +{ + char* argv[2]; + argv[0] = "list"; + argv[1] = "applymasks"; + + fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]); + + return jt_dbg_list(2, argv); +} + + +static inline int ltrace_filter(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "filter"; + argv[1] = subsys_or_mask; + return jt_dbg_filter(2, argv); +} + +static inline int ltrace_show(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "show"; + argv[1] = subsys_or_mask; + return jt_dbg_show(2, argv); +} + +static inline int ltrace_start() +{ + int rc = 0; + dbg_initialize(0, NULL); +#ifdef PORTALS_DEV_ID + rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); +#endif + ltrace_filter("class"); + ltrace_filter("socknal"); + ltrace_filter("qswnal"); + ltrace_filter("gmnal"); + ltrace_filter("portals"); + + ltrace_show("all_types"); + ltrace_filter("trace"); + ltrace_filter("malloc"); + ltrace_filter("net"); + ltrace_filter("page"); + ltrace_filter("other"); + ltrace_filter("info"); + ltrace_applymasks(); + + return rc; +} + + +static inline void ltrace_stop() +{ +#ifdef PORTALS_DEV_ID + unregister_ioc_dev(PORTALS_DEV_ID); +#endif +} + +static inline int not_uml() +{ + /* Return Values: + * 0 when run under UML + * 1 when run on host + * <0 when lookup failed + */ + struct stat buf; + int rc = stat("/dev/ubd", &buf); + rc = ((rc<0) && (errno == ENOENT)) ? 
1 : rc;
+        if (rc<0) {
+                fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+                rc = 1; /* Assume host */
+        }
+        return rc;
+}
+
+#define LTRACE_MAX_NOB 256
+/*
+ * Append a "ps" listing of all processes (pid and command name) to the
+ * file 'fname'.  Every output line is prefixed with a header written in
+ * the CDEBUG log format so it can be read alongside real debug output.
+ *
+ * The shell command is assembled in a LTRACE_MAX_NOB-byte buffer.  Each
+ * append is limited to the space that is still free; if 'fname' does not
+ * fit, an error is printed and nothing is executed.
+ *
+ * NOTE(review): 'fname' is pasted into the shell command unquoted, so it
+ * must not contain shell metacharacters, and the exit status of system()
+ * is ignored.
+ */
+static inline void ltrace_add_processnames(char* fname)
+{
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        struct timezone tz;
+        int nob;
+        int underuml = !not_uml();
+
+        gettimeofday(&tv, &tz);
+
+        nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \"");
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB - nob, "%02x:%06x:%d:%lu.%06lu ",
+                        S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec);
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB - nob,
+                                 "(%s:%d:%s() %d | %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB - nob,
+                                 "(%s:%d:%s() %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+
+        /* The pieces written so far have a small, bounded length, so nob is
+         * still well inside the buffer here; only fname can overflow it.
+         * snprintf returns the length it wanted to write, so a total of
+         * LTRACE_MAX_NOB or more means the command was truncated. */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB - nob, " %%p %%c\" >> %s", fname);
+        if (nob >= LTRACE_MAX_NOB) {
+                fprintf(stderr, "ltrace_add_processnames: file name too long: %s\n", fname);
+                return;
+        }
+        system(cmdbuf);
+}
+
+#endif
diff --git a/lustre/portals/include/portals/myrnal.h b/lustre/portals/include/portals/myrnal.h
new file mode 100644
index 0000000..6a61fd5
--- /dev/null
+++ b/lustre/portals/include/portals/myrnal.h
@@ -0,0 +1,27 @@
+/*
+** $Id: myrnal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+        void *args;
+        size_t args_len;
+        void *ret;
+        size_t ret_len;
+        int p3cmd;
+} myrnal_forward_t;
+
+#endif                          /* MYRNAL_H */
diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h
new file mode 100644
index 0000000..c1c50ed
--- /dev/null
+++ b/lustre/portals/include/portals/nal.h
@@ -0,0 +1,50 @@
+/*
+** $Id: nal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+
+/*
+ * p30/nal.h
+ *
+ * The API side NAL declarations
+ *
+ * struct nal_t is the API-side handle for one NAL: a few data members
+ * followed by a table of function pointers.
+ */
+
+#include
+
+#ifdef yield            /* nal_t has a member called 'yield'; a macro of that name would rewrite it */
+#undef yield
+#endif
+
+typedef struct nal_t nal_t;
+
+struct nal_t {
+        ptl_ni_t ni;
+        int refct;              /* presumably a reference count -- TODO confirm against users */
+        void *nal_data;         /* untyped pointer; presumably the NAL's private state -- TODO confirm */
+        int *timeout;           /* for libp30api users */
+        int (*forward) (nal_t * nal, int index, /* Function ID */
+                        void *args, size_t arg_len, void *ret, size_t ret_len); /* args/ret: buffers for the call's arguments and result */
+
+        int (*shutdown) (nal_t * nal, int interface);
+
+        int (*validate) (nal_t * nal, void *base, size_t extent);       /* takes a base pointer and an extent in bytes */
+
+        void (*yield) (nal_t * nal);
+
+        void (*lock) (nal_t * nal, unsigned long *flags);       /* presumably 'flags' carries saved state from lock to unlock -- TODO confirm */
+
+        void (*unlock) (nal_t * nal, unsigned long *flags);
+};
+
+typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);       /* function type shared by the PTL_IFACE_* entry points below */
+extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);     /* maps a handle to a nal_t; presumably the NAL that owns it -- TODO confirm */
+
+#ifndef PTL_IFACE_DEFAULT       /* may be pre-defined by the includer; otherwise the IP interface is used */
+#define PTL_IFACE_DEFAULT       (PTL_IFACE_IP)
+#endif
+
+#endif
diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h
new file mode 100644
index 0000000..1b837b4
--- /dev/null
+++ b/lustre/portals/include/portals/nalids.h
@@ -0,0 +1,4 @@
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h
new file mode 100644
index 0000000..a4ea39b
--- /dev/null
+++ b/lustre/portals/include/portals/p30.h
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include
+#include
+#else
+#include
+#include
+#endif
+
+#include
+#include
+#include +#include +#include + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, & PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. + */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... 
+ */
+#define PTL_DEBUG_NI_ALL        (0xF000ul)      /* Only the NAL flags */
+/* NOTE(review): 0xF000ul selects bits 12-15, but PTL_DEBUG_NI0..NI7 below
+ * are bits 24-31 -- confirm which mask is intended before relying on it. */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h
new file mode 100644
index 0000000..34e5dc5
--- /dev/null
+++ b/lustre/portals/include/portals/ppid.h
@@ -0,0 +1,53 @@
+/*
+ * TITLE(ppid_h, "@(#) $Id: ppid.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $");
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* this needs to fit into 16 bits so the
+                                    maximum value is 65535. having it "large"
+                                    can help w/ debugging process accounting
+                                    but there are reasons for making it
+                                    somewhat smaller than the maximum --
+                                    requiring storage for arrays that index
+                                    on the ppid, eg... */
+
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+/* The three sums below are parenthesised so that they keep their value when
+ * the macro is used inside a larger expression (e.g. 2 * PPID_FLOATING). */
+#define PPID_FLOATING    (MAX_FIXED_PPID+1)     /* Floating area starts here */
+#define GID_FLOATING     (MAX_FIXED_GID+1)      /* Floating area starts here */
+#define NUM_PTL_TASKS    (MAX_FIXED_PPID+80)    /* Maximum no.
portals tasks */ + +#define PPID_AUTO 0 + +/* Minimum PPID is 1 */ +#define PPID_BEBOPD 1 /* bebopd */ +#define GID_BEBOPD 1 /* bebopd */ + +#define PPID_PCT 2 /* pct */ +#define GID_PCT 2 /* pct */ + +#define PPID_FYOD 3 /* fyod */ +#define GID_FYOD 3 /* fyod */ + +#define PPID_GDBWRAP 11 /* portals proxy for gdb */ +#define GID_GDBWRAP 11 /* portals proxy for gdb */ + +#define PPID_TEST 15 /* for portals tests */ +#define GID_TEST 15 + +#define GID_YOD 5 /* yod */ +#define GID_PINGD 6 /* pingd */ +#define GID_BT 7 /* bt */ +#define GID_PTLTEST 8 /* ptltest */ +#define GID_CGDB 9 /* cgdb */ +#define GID_TVDSVR 10 /* start-tvdsvr */ + +#endif /* _INCppidh_ */ diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h new file mode 100644 index 0000000..fdaae69 --- /dev/null +++ b/lustre/portals/include/portals/ptlctl.h @@ -0,0 +1,74 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)); +int jt_ioc_dump(int argc, char 
**argv); + +#endif diff --git a/lustre/portals/include/portals/stringtab.h b/lustre/portals/include/portals/stringtab.h new file mode 100644 index 0000000..65ab189 --- /dev/null +++ b/lustre/portals/include/portals/stringtab.h @@ -0,0 +1,6 @@ +/* +** $Id: stringtab.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $ +*/ +/* + * stringtab.h + */ diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h new file mode 100644 index 0000000..d4038b6 --- /dev/null +++ b/lustre/portals/include/portals/types.h @@ -0,0 +1,157 @@ +#ifndef _P30_TYPES_H_ +#define _P30_TYPES_H_ + +#ifdef __linux__ +#include +#include +#else +#include +typedef u_int32_t __u32; +typedef u_int64_t __u64; +typedef unsigned long long cycles_t; +static inline cycles_t get_cycles(void) { return 0; } +#endif + +typedef __u64 ptl_nid_t; +typedef __u32 ptl_pid_t; +typedef __u32 ptl_pt_index_t; +typedef __u32 ptl_ac_index_t; +typedef __u64 ptl_match_bits_t; +typedef __u64 ptl_hdr_data_t; +typedef __u32 ptl_size_t; + +typedef struct { + unsigned long nal_idx; /* which network interface */ + __u64 cookie; /* which thing on that interface */ +} ptl_handle_any_t; + +typedef ptl_handle_any_t ptl_handle_ni_t; +typedef ptl_handle_any_t ptl_handle_eq_t; +typedef ptl_handle_any_t ptl_handle_md_t; +typedef ptl_handle_any_t ptl_handle_me_t; + +#define PTL_HANDLE_NONE \ +((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) +#define PTL_EQ_NONE PTL_HANDLE_NONE + +static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +{ + return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); +} + +#define PTL_NID_ANY ((ptl_nid_t) -1) +#define PTL_PID_ANY ((ptl_pid_t) -1) + +typedef struct { + ptl_nid_t nid; + ptl_pid_t pid; /* node id / process id */ +} ptl_process_id_t; + +typedef enum { + PTL_RETAIN = 0, + PTL_UNLINK +} ptl_unlink_t; + +typedef enum { + PTL_INS_BEFORE, + PTL_INS_AFTER +} ptl_ins_pos_t; + +typedef struct { + struct page *kiov_page; + unsigned int kiov_len; + 
unsigned int kiov_offset; +} ptl_kiov_t; + +typedef struct { + void *start; + ptl_size_t length; + int threshold; + int max_size; + unsigned int options; + void *user_ptr; + ptl_handle_eq_t eventq; + unsigned int niov; +} ptl_md_t; + +/* Options for the MD structure */ +#define PTL_MD_OP_PUT (1 << 0) +#define PTL_MD_OP_GET (1 << 1) +#define PTL_MD_MANAGE_REMOTE (1 << 2) +#define PTL_MD_AUTO_UNLINK (1 << 3) +#define PTL_MD_TRUNCATE (1 << 4) +#define PTL_MD_ACK_DISABLE (1 << 5) +#define PTL_MD_IOV (1 << 6) +#define PTL_MD_MAX_SIZE (1 << 7) +#define PTL_MD_KIOV (1 << 8) + +#define PTL_MD_THRESH_INF (-1) + +typedef enum { + PTL_EVENT_GET, + PTL_EVENT_PUT, + PTL_EVENT_REPLY, + PTL_EVENT_ACK, + PTL_EVENT_SENT +} ptl_event_kind_t; + +#define PTL_SEQ_BASETYPE long +typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; +#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) + +typedef struct { + ptl_event_kind_t type; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength, mlength, offset; + ptl_handle_me_t unlinked_me; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + cycles_t arrival_time; + volatile ptl_seq_t sequence; +} ptl_event_t; + + +typedef enum { + PTL_ACK_REQ, + PTL_NOACK_REQ +} ptl_ack_req_t; + + +typedef struct { + volatile ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + ptl_handle_any_t cb_eq_handle; +} ptl_eq_t; + +typedef struct { + ptl_eq_t *eq; +} ptl_ni_t; + + +typedef struct { + int max_match_entries; /* max number of match entries */ + int max_mem_descriptors; /* max number of memory descriptors */ + int max_event_queues; /* max number of event queues */ + int max_atable_index; /* maximum access control list table index */ + int max_ptable_index; /* maximum portals table index */ +} ptl_ni_limits_t; + +/* + * Status registers + */ +typedef enum { + PTL_SR_DROP_COUNT, + PTL_SR_DROP_LENGTH, + PTL_SR_RECV_COUNT, + PTL_SR_RECV_LENGTH, + PTL_SR_SEND_COUNT, + PTL_SR_SEND_LENGTH, + 
PTL_SR_MSGS_MAX, +} ptl_sr_index_t; + +typedef int ptl_sr_value_t; + +#endif diff --git a/lustre/portals/knals/Makefile.am b/lustre/portals/knals/Makefile.am new file mode 100644 index 0000000..5c6085e --- /dev/null +++ b/lustre/portals/knals/Makefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@ diff --git a/lustre/portals/knals/Makefile.mk b/lustre/portals/knals/Makefile.mk new file mode 100644 index 0000000..ce40a60 --- /dev/null +++ b/lustre/portals/knals/Makefile.mk @@ -0,0 +1,4 @@ +include ../Kernelenv + +obj-y = socknal/ +# more coming... \ No newline at end of file diff --git a/lustre/portals/knals/gmnal/Makefile.am b/lustre/portals/knals/gmnal/Makefile.am new file mode 100644 index 0000000..1dc6f4e --- /dev/null +++ b/lustre/portals/knals/gmnal/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kgmnal +modulenet_DATA = kgmnal.o +EXTRA_PROGRAMS = kgmnal + +DEFS = +kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h diff --git a/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch new file mode 100644 index 0000000..23c80d9 --- /dev/null +++ b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch @@ -0,0 +1,43 @@ +diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c +--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002 ++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002 +@@ -30,6 +30,8 @@ + * + ************************************************************************/ + ++#define EXPORT_SYMTAB ++ + #include + #include + +@@ -4075,6 +4077,28 @@ + return 0; + } + ++EXPORT_SYMBOL(gm_blocking_receive_no_spin); ++EXPORT_SYMBOL(gm_close); ++EXPORT_SYMBOL(gm_dma_free); ++EXPORT_SYMBOL(gm_dma_malloc); ++EXPORT_SYMBOL(gm_drop_sends); ++EXPORT_SYMBOL(gm_finalize); ++EXPORT_SYMBOL(gm_get_node_id); ++EXPORT_SYMBOL(gm_init); ++EXPORT_SYMBOL(gm_initialize_alarm); ++EXPORT_SYMBOL(gm_max_node_id_in_use); ++EXPORT_SYMBOL(gm_min_size_for_length); ++EXPORT_SYMBOL(gm_num_receive_tokens); ++EXPORT_SYMBOL(gm_num_send_tokens); ++EXPORT_SYMBOL(gm_open); ++EXPORT_SYMBOL(gm_provide_receive_buffer); ++EXPORT_SYMBOL(gm_resume_sending); ++EXPORT_SYMBOL(gm_send_with_callback); ++EXPORT_SYMBOL(gm_set_acceptable_sizes); ++EXPORT_SYMBOL(gm_set_alarm); ++EXPORT_SYMBOL(gm_unknown); ++ ++ + /* + This file uses GM standard indentation. 
+ +Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~ +Only in gm-1.5.2.1_Linux-cfs/: trace diff --git a/lustre/portals/knals/gmnal/gmnal.c b/lustre/portals/knals/gmnal/gmnal.c new file mode 100644 index 0000000..ceeea2a --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal.c @@ -0,0 +1,284 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "gmnal.h" + +ptl_handle_ni_t kgmnal_ni; +nal_t kgmnal_api; + +kgmnal_data_t kgmnal_data; +int gmnal_debug = 0; + +kpr_nal_interface_t kqswnal_router_interface = { + kprni_nalid: GMNAL, + kprni_arg: NULL, + kprni_fwd: kgmnal_fwd_packet, +}; + +static int kgmnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ + return PTL_OK; +} + +static void kgmnal_lock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +static void kgmnal_unlock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +static int kgmnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kgmnal_api); + return 0; +} + +static void kgmnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kgmnal_api); + + if (current->need_resched) + schedule(); + return; +} + +kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx) +{ + kgmnal_rx_t *conn; + + PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t)); + /* Check for out of mem here */ + if (conn==NULL) { + printk("kgm_add_recv: memory alloc failed\n"); + return NULL; + } + + list_add(&conn->krx_item,(struct list_head *)&data->kgm_list); + // conn->ndx=ndx; + // conn->len=conn->ptlhdr_copied=0; + // conn->loopback=0; + return conn; +} + +static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + unsigned int nnids; + + 
gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n", + kgmnal_data.kgm_nid, nnids); + lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size); + return &kgmnal_api; +} + +/* + * Module exit: unregister the NI symbol, shut down the portals NI and + * library, close the GM port, then release the transmit descriptors and + * every receive descriptor still linked on kgm_list. + */ +static void __exit +kgmnal_finalize(void) +{ + PORTAL_SYMBOL_UNREGISTER (kgmnal_ni); + PtlNIFini(kgmnal_ni); + /* lib_init() was called on kgmnal_lib, so that is what gets torn down */ + lib_fini(&kgmnal_lib); + + if (kgmnal_data.kgm_port) { + gm_close(kgmnal_data.kgm_port); + } + + /* FIXME: free dma buffers */ + /* FIXME: kill receiver thread */ + + PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); + + /* Always take the head entry: the list shrinks on every pass, so a + * cursor-based walk is not safe here */ + while (!list_empty(&kgmnal_data.kgm_list)) { + kgmnal_rx_t *conn; + conn = list_entry(kgmnal_data.kgm_list.next, kgmnal_rx_t, krx_item); + CDEBUG(D_IOCTL, "freeing conn %p\n",conn); + list_del(&conn->krx_item); + PORTAL_FREE(conn, sizeof(*conn)); + } + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + +static int __init +kgmnal_initialize(void) +{ + int rc; + int ntok; + unsigned long sizemask; + unsigned int nid; + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kgmnal_api.forward = kgmnal_forward; + kgmnal_api.shutdown = kgmnal_shutdown; + kgmnal_api.yield = kgmnal_yield; + kgmnal_api.validate = NULL; /* our api validate is a NOOP */ + kgmnal_api.lock= kgmnal_lock; + kgmnal_api.unlock= kgmnal_unlock; + kgmnal_api.nal_data = &kgmnal_data; + + kgmnal_lib.nal_data = &kgmnal_data; + + memset(&kgmnal_data, 0, sizeof(kgmnal_data)); + + INIT_LIST_HEAD(&kgmnal_data.kgm_list); + kgmnal_data.kgm_cb = &kgmnal_lib; + + /* Allocate transmit descriptors */ + PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); + if (kgmnal_data.kgm_trans==NULL) { + printk("kgmnal: init: failed to allocate transmit " + "descriptors\n"); + return -1; + } + memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS)); + + spin_lock_init(&kgmnal_data.kgm_dispatch_lock); + 
spin_lock_init(&kgmnal_data.kgm_update_lock); + spin_lock_init(&kgmnal_data.kgm_send_lock); + + /* Do the receiver and xmtr allocation */ + + rc = gm_init(); + if (rc != GM_SUCCESS) { + CERROR("gm_init failed: %d\n", rc); + return -1; + } + + rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME, + GM_API_VERSION_1_1); + if (rc != GM_SUCCESS) { + gm_finalize(); + kgmnal_data.kgm_port = NULL; + CERROR("gm_open failed: %d\n", rc); + return -1; + } + gm_get_node_id(kgmnal_data.kgm_port, &nid); + kgmnal_data.kgm_nid = nid; + /* Allocate 2 different sizes of buffers. For now, use half + the tokens for each. */ + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n", + ntok, MSG_LEN_LARGE); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_LARGE); + if (buffer == NULL) { + /* rc still holds GM_SUCCESS from gm_open(), so it is not reported */ + CERROR("gm_dma_malloc failed for large recv buffer\n"); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_LARGE, GM_LOW_PRIORITY); + } + + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n", + ntok, MSG_LEN_SMALL); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_SMALL); + if (buffer == NULL) { + /* rc still holds GM_SUCCESS from gm_open(), so it is not reported */ + CERROR("gm_dma_malloc failed for small recv buffer\n"); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + } + /* one bit per GM size class that this port accepts */ + sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL); + /* sizemask is unsigned long, hence %lx */ + CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%lx\n", + kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask); + 
gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY, + sizemask); + gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0); + + /* Initialize Network Interface */ + rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + return (-ENOMEM); + } + + /* Start receiver thread */ + kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0); + + PORTAL_SYMBOL_REGISTER(kgmnal_ni); + + kgmnal_data.kgm_init = 1; + + return 0; +} + +MODULE_AUTHOR("Robert Read "); +MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1"); +MODULE_LICENSE("GPL"); + +module_init (kgmnal_initialize); +module_exit (kgmnal_finalize); + +EXPORT_SYMBOL (kgmnal_ni); diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h new file mode 100644 index 0000000..47e8c3c --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -0,0 +1,101 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _GMNAL_H +#define _GMNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_GMNAL + +#include +#include +#include + +#include + + +/* + * Myrinet GM NAL + */ +#define NPAGES_LARGE 16 +#define NPAGES_SMALL 1 +#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE +#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE +#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE)) +#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL)) + +#define TXMSGS 64 /* Number of Transmit Messages */ +#define ENVELOPES 8 /* Number of outstanding receive msgs */ + +#define KGM_PORT_NUM 3 +#define KGM_HOSTNAME "kgmnal" + + +typedef struct { + char *krx_buffer; + unsigned long krx_len; + unsigned int krx_size; + unsigned int krx_priority; + struct list_head krx_item; +} kgmnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void 
*ktx_private; + lib_msg_t *ktx_cookie; + char *ktx_buffer; + size_t ktx_len; + unsigned long ktx_size; + int ktx_ndx; + unsigned int ktx_priority; + unsigned int ktx_tgt_node; + unsigned int ktx_tgt_port_id; +} kgmnal_tx_t; + + +typedef struct { + char kgm_init; + char kgm_shuttingdown; + struct gm_port *kgm_port; + struct list_head kgm_list; + ptl_nid_t kgm_nid; + nal_cb_t *kgm_cb; + struct kgm_trans *kgm_trans; + struct tq_struct kgm_ready_tq; + spinlock_t kgm_dispatch_lock; + spinlock_t kgm_update_lock; + spinlock_t kgm_send_lock; +} kgmnal_data_t; + +int kgm_init(kgmnal_data_t *kgm_data); +int kgmnal_recv_thread(void *); +int gm_return_mynid(void); +void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +extern kgmnal_data_t kgmnal_data; +extern nal_t kgmnal_api; +extern nal_cb_t kgmnal_lib; + +#endif /* _GMNAL_H */ + diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c new file mode 100644 index 0000000..3d4c86d --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -0,0 +1,517 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* TODO + * preallocate send buffers, store on list + * put receive buffers on queue, handle with receive threads + * use routing + */ + +#include "gmnal.h" + +extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int); + +static kgmnal_tx_t * +get_trans(void) +{ + kgmnal_tx_t *t; + PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t))); + return t; +} + +static void +put_trans(kgmnal_tx_t *t) +{ + PORTAL_FREE(t, sizeof(kgmnal_tx_t)); +} + +int +kgmnal_ispeer (ptl_nid_t nid) +{ + unsigned int gmnid = (unsigned int)nid; + unsigned int nnids; + + gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */ + gmnid < nnids); /* it's in this machine */ +} + +/* + * LIB functions follow + * + */ +static int +kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static int +kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static void * +kgmnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + +static void +kgmnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kgmnal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kgmnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kgm_dispatch_lock,*flags); +} + + +static void +kgmnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags); +} + + +static int +kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +/* FIXME rmr: add routing code here */ +static void +kgmnal_tx_done(kgmnal_tx_t *trans, int error) +{ + lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie); + + gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer); + + trans->ktx_buffer = NULL; + trans->ktx_len = 0; + + put_trans(trans); +} +static char * gm_error_strings[GM_NUM_STATUS_CODES] = { + [GM_SUCCESS] = "GM_SUCCESS", + [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT", + [GM_SEND_REJECTED] = "GM_SEND_REJECTED", + [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED", + [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE", + [GM_SEND_DROPPED] = "GM_SEND_DROPPED", + [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED", +}; + +/* + * Map a GM status code to its name. Codes that fall outside the table, or + * that have no entry in it, yield "Unknown error". + */ +inline char * get_error(int status) +{ + if (status >= 0 && status < GM_NUM_STATUS_CODES && + gm_error_strings[status] != NULL) + return gm_error_strings[status]; + else + return "Unknown error"; +} + +static void +kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status) +{ + CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status); +} + +static void +kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status) +{ + kgmnal_tx_t *ktx = (kgmnal_tx_t *)context; + int err = 0; + + LASSERT (p != NULL); + LASSERT (ktx != 
NULL); + + CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id); + + switch((int)status) { + case GM_SUCCESS: /* normal */ + break; + case GM_SEND_TIMED_OUT: /* application error */ + case GM_SEND_REJECTED: /* size of msg unacceptable */ + case GM_SEND_TARGET_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_TARGET_NODE_UNREACHABLE: + case GM_SEND_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_DROPPED: + CERROR("%s (%d):\n", get_error(status), status); + err = -EIO; + break; + default: + CERROR("Unknown status: %d\n", status); + err = -EIO; + break; + } + + kgmnal_tx_done(ktx, err); +} + +/* + */ + +static int +kgmnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t len) +{ + /* + * ipnal assumes that this is the private as passed to lib_dispatch.. + * so do we :/ + */ + kgmnal_tx_t *ktx=NULL; + int rc=0; + void * buf; + int buf_len = sizeof(ptl_hdr_t) + len; + int buf_size = 0; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + PROF_START(gmnal_send); + + + CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n", + len, iov, nid, KGM_PORT_NUM); + + /* ensure there is an available tx handle */ + + /* save transaction info to trans for later finalize and cleanup */ + ktx = get_trans(); + if (ktx == NULL) { + rc = -ENOMEM; + goto send_exit; + } + + /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce + header and data. + Also, memory must be dma'able or registered with GM. 
*/ + + if (buf_len <= MSG_LEN_SMALL) { + buf_size = MSG_SIZE_SMALL; + } else if (buf_len <= MSG_LEN_LARGE) { + buf_size = MSG_SIZE_LARGE; + } else { + /* report the byte limit that was tested, not the GM size class */ + printk("kgmnal:request exceeds TX MTU size (%d).\n", + (int)(MSG_LEN_LARGE)); + /* nothing has been handed to GM yet, so the descriptor is still ours */ + put_trans(ktx); + rc = -1; + goto send_exit; + } + + buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len); + if (buf == NULL) { + /* no tx callback will run for this descriptor; release it here */ + put_trans(ktx); + rc = -ENOMEM; + goto send_exit; + } + memcpy(buf, hdr, sizeof(ptl_hdr_t)); + + if (len != 0) + lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), + options, niov, iov, len); + + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + ktx->ktx_len = buf_len; + ktx->ktx_size = buf_size; + ktx->ktx_buffer = buf; + ktx->ktx_priority = GM_LOW_PRIORITY; + ktx->ktx_tgt_node = nid; + ktx->ktx_tgt_port_id = KGM_PORT_NUM; + + CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx " + "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM, + GM_LOW_PRIORITY); + + /* ktx and buf are released by kgmnal_txhandler() via kgmnal_tx_done() */ + gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size, + buf_len, GM_LOW_PRIORITY, + nid, KGM_PORT_NUM, + kgmnal_txhandler, ktx); + + PROF_FINISH(gmnal_send); + send_exit: + return rc; +} +void +kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + CERROR ("forwarding not implemented\n"); +} + + +static inline void +kgmnal_requeue_rx(kgmnal_rx_t *krx) +{ + gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer, + krx->krx_size, krx->krx_priority); +} + +/* Process a received portals packet */ + +/* Receive Interrupt Handler */ +static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size, + void * buf, unsigned int pri) +{ + ptl_hdr_t *hdr = buf; + kgmnal_rx_t krx; + + CDEBUG(D_NET,"buf %p, len %ld\n", buf, len); + + if ( len < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? 
*/ + if (kgm->kgm_shuttingdown) + return; + CERROR("kgmnal: did not receive complete portal header, " + "len= %ld", len); + gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri); + return; + } + + /* might want to use seperate threads to handle receive */ + krx.krx_buffer = buf; + krx.krx_len = len; + krx.krx_size = size; + krx.krx_priority = pri; + + if ( hdr->dest_nid == kgmnal_lib.ni.nid ) { + PROF_START(lib_parse); + lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); + PROF_FINISH(lib_parse); + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx: target is " + "a peer", hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); + } else { + /* forward to gateway */ + CERROR("forwarding not implemented yet"); + kgmnal_requeue_rx(&krx); + } + + return; +} + + +static int kgmnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t mlen, + size_t rlen) +{ + kgmnal_rx_t *krx = private; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen); + + /* What was actually received must be >= what sender claims to + * have sent. This is an LASSERT, since lib-move doesn't + * check cb return code yet. */ + LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + LASSERT (mlen <= rlen); + + PROF_START(gmnal_recv); + + if(mlen != 0) { + PROF_START(memcpy); + lib_copy_buf2iov (options, niov, iov, + krx->krx_buffer + sizeof (ptl_hdr_t), mlen); + PROF_FINISH(memcpy); + } + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + kgmnal_requeue_rx(krx); + + PROF_FINISH(gmnal_recv); + + return rlen; +} + + +static void kgmnal_shutdown(void * none) +{ + CERROR("called\n"); + return; +} + +/* + * Set terminate and use alarm to wake up the recv thread. 
+ */ +static void recv_shutdown(kgmnal_data_t *kgm) +{ + gm_alarm_t alarm; + + kgm->kgm_shuttingdown = 1; + gm_initialize_alarm(&alarm); + gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL); +} + +int kgmnal_end(kgmnal_data_t *kgm) +{ + + /* wait for sends to finish ? */ + /* remove receive buffers */ + /* shutdown receive thread */ + + recv_shutdown(kgm); + + return 0; +} + +/* Used only for the spinner */ +int kgmnal_recv_thread(void *arg) +{ + kgmnal_data_t *kgm = arg; + + LASSERT(kgm != NULL); + + kportal_daemonize("kgmnal_rx"); + + while(1) { + gm_recv_event_t *e; + int priority = GM_LOW_PRIORITY; + if (kgm->kgm_shuttingdown) + break; + + e = gm_blocking_receive_no_spin(kgm->kgm_port); + if (e == NULL) { + CERROR("gm_blocking_receive returned NULL\n"); + break; + } + + switch(gm_ntohc(e->recv.type)) { + case GM_HIGH_RECV_EVENT: + priority = GM_HIGH_PRIORITY; + /* fall through */ + case GM_RECV_EVENT: + kgmnal_rx(kgm, gm_ntohl(e->recv.length), + gm_ntohc(e->recv.size), + gm_ntohp(e->recv.buffer), priority); + break; + case GM_ALARM_EVENT: + CERROR("received alarm"); + gm_unknown(kgm->kgm_port, e); + break; + case GM_BAD_SEND_DETECTED_EVENT: /* ?? */ + CERROR("received bad send!\n"); + break; + default: + gm_unknown(kgm->kgm_port, e); + } + } + + CERROR("shuttting down.\n"); + return 0; +} + +nal_cb_t kgmnal_lib = { + nal_data: &kgmnal_data, /* NAL private data */ + cb_send: kgmnal_send, + cb_recv: kgmnal_recv, + cb_read: kgmnal_read, + cb_write: kgmnal_write, + cb_malloc: kgmnal_malloc, + cb_free: kgmnal_free, + cb_printf: kgmnal_printf, + cb_cli: kgmnal_cli, + cb_sti: kgmnal_sti, + cb_dist: kgmnal_dist +}; diff --git a/lustre/portals/knals/qswnal/Makefile.am b/lustre/portals/knals/qswnal/Makefile.am new file mode 100644 index 0000000..6759b96 --- /dev/null +++ b/lustre/portals/knals/qswnal/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kqswnal +modulenet_DATA = kqswnal.o +EXTRA_PROGRAMS = kqswnal + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c new file mode 100644 index 0000000..d64b7ad --- /dev/null +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -0,0 +1,578 @@ +/* + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. Marcus Miller - Based on ksocknal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "qswnal.h" + +ptl_handle_ni_t kqswnal_ni; +nal_t kqswnal_api; +kqswnal_data_t kqswnal_data; + +kpr_nal_interface_t kqswnal_router_interface = { + kprni_nalid: QSWNAL, + kprni_arg: NULL, + kprni_fwd: kqswnal_fwd_packet, +}; + + +static int +kqswnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ + return (PTL_OK); +} + +static void +kqswnal_lock (nal_t *nal, unsigned long *flags) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +static void +kqswnal_unlock(nal_t *nal, unsigned long *flags) +{ + kqswnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kqn_cb; + + LASSERT (nal == &kqswnal_api); + LASSERT (k == &kqswnal_data); + LASSERT (nal_cb == &kqswnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +static int +kqswnal_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "shutdown\n"); + + LASSERT (nal == &kqswnal_api); + return (0); +} + +static void +kqswnal_yield( nal_t *nal ) +{ + CDEBUG (D_NET, "yield\n"); + + if (current->need_resched) + schedule(); + return; +} + +static nal_t * +kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, + ptl_pid_t requested_pid) +{ + ptl_nid_t mynid = ep_nodeid (kqswnal_data.kqn_epdev); + int nnids = ep_numnodes (kqswnal_data.kqn_epdev); + + CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid,nnids); + + lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size); + + return (&kqswnal_api); +} + +void __exit +kqswnal_finalise (void) +{ + switch (kqswnal_data.kqn_init) + { + default: + LASSERT (0); + + case KQN_INIT_ALL: + 
PORTAL_SYMBOL_UNREGISTER (kqswnal_ni); + /* fall through */ + + case KQN_INIT_PTL: + PtlNIFini (kqswnal_ni); + lib_fini (&kqswnal_lib); + /* fall through */ + + case KQN_INIT_DATA: + break; + + case KQN_INIT_NOTHING: + return; + } + + /**********************************************************************/ + /* Make router stop her calling me and fail any more call-ins */ + kpr_shutdown (&kqswnal_data.kqn_router); + + /**********************************************************************/ + /* flag threads to terminate, wake them and wait for them to die */ + + kqswnal_data.kqn_shuttingdown = 1; + wake_up_all (&kqswnal_data.kqn_sched_waitq); + + while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) { + CDEBUG(D_NET, "waiting for %d threads to terminate\n", + atomic_read (&kqswnal_data.kqn_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + /**********************************************************************/ + /* close elan comms */ + + if (kqswnal_data.kqn_eprx_small != NULL) + ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small); + + if (kqswnal_data.kqn_eprx_large != NULL) + ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large); + + if (kqswnal_data.kqn_eptx != NULL) + ep_free_large_xmtr (kqswnal_data.kqn_eptx); + + /**********************************************************************/ + /* No more threads. No more portals, router or comms callbacks! + * I control the horizontals and the verticals... 
+ */ + + /**********************************************************************/ + /* Complete any blocked forwarding packets with error + */ + + while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) + { + kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); + } + + while (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH); + } + + /**********************************************************************/ + /* Wait for router to complete any packets I sent her + */ + + kpr_deregister (&kqswnal_data.kqn_router); + + + /**********************************************************************/ + /* Unmap message buffers and free all descriptors and buffers + */ + + if (kqswnal_data.kqn_eprxdmahandle != NULL) + { + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle, 0, + KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE); + + elan3_dma_release(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle); + } + + if (kqswnal_data.kqn_eptxdmahandle != NULL) + { + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, 0, + KQSW_NTXMSGPAGES * (KQSW_NTXMSGS + + KQSW_NNBLK_TXMSGS)); + + elan3_dma_release(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle); + } + + if (kqswnal_data.kqn_txds != NULL) + { + int i; + + for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++) + { + kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; + + if (ktx->ktx_buffer != NULL) + PORTAL_FREE(ktx->ktx_buffer, + KQSW_TX_BUFFER_SIZE); + } + + PORTAL_FREE(kqswnal_data.kqn_txds, + sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS + + KQSW_NNBLK_TXMSGS)); + } + + 
if (kqswnal_data.kqn_rxds != NULL) + { + int i; + int j; + + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + for (j = 0; j < krx->krx_npages; j++) + if (krx->krx_pages[j] != NULL) + __free_page (krx->krx_pages[j]); + } + + PORTAL_FREE(kqswnal_data.kqn_rxds, + sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGS_LARGE)); + } + + /* resets flags, pointers to NULL etc */ + memset(&kqswnal_data, 0, sizeof (kqswnal_data)); + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); + + printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +static int __init +kqswnal_initialise (void) +{ + ELAN3_DMA_REQUEST dmareq; + int rc; + int i; + int elan_page_idx; + int pkmem = atomic_read(&portal_kmemory); + + LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); + + kqswnal_api.forward = kqswnal_forward; + kqswnal_api.shutdown = kqswnal_shutdown; + kqswnal_api.yield = kqswnal_yield; + kqswnal_api.validate = NULL; /* our api validate is a NOOP */ + kqswnal_api.lock = kqswnal_lock; + kqswnal_api.unlock = kqswnal_unlock; + kqswnal_api.nal_data = &kqswnal_data; + + kqswnal_lib.nal_data = &kqswnal_data; + + /* ensure all pointers NULL etc */ + memset (&kqswnal_data, 0, sizeof (kqswnal_data)); + + kqswnal_data.kqn_cb = &kqswnal_lib; + + INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); + spin_lock_init (&kqswnal_data.kqn_idletxd_lock); + init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); + INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); + + INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds); + INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); + + spin_lock_init (&kqswnal_data.kqn_sched_lock); + init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); + + spin_lock_init 
(&kqswnal_data.kqn_statelock); + + /* pointers/lists/locks initialised */ + kqswnal_data.kqn_init = KQN_INIT_DATA; + + /**********************************************************************/ + /* Find the first Elan device */ + + kqswnal_data.kqn_epdev = ep_device (0); + if (kqswnal_data.kqn_epdev == NULL) + { + CERROR ("Can't get elan device 0\n"); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Get the transmitter */ + + kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev); + if (kqswnal_data.kqn_eptx == NULL) + { + CERROR ("Can't allocate transmitter\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Get the receivers */ + + kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev, + EP_SVC_LARGE_PORTALS_SMALL, + KQSW_EP_ENVELOPES_SMALL); + if (kqswnal_data.kqn_eprx_small == NULL) + { + CERROR ("Can't install small msg receiver\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev, + EP_SVC_LARGE_PORTALS_LARGE, + KQSW_EP_ENVELOPES_LARGE); + if (kqswnal_data.kqn_eprx_large == NULL) + { + CERROR ("Can't install large msg receiver\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Reserve Elan address space for transmit buffers */ + + dmareq.Waitfn = DDI_DMA_SLEEP; + dmareq.ElanAddr = (E3_Addr) 0; + dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; + dmareq.Perm = ELAN_PERM_REMOTEREAD; + + rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState, + KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), + &dmareq, &kqswnal_data.kqn_eptxdmahandle); + if (rc != DDI_SUCCESS) + { + /* this is the transmit reservation; keep it distinct from the rx one */ + CERROR ("Can't reserve tx dma space\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* 
Reserve Elan address space for receive buffers */ + + dmareq.Waitfn = DDI_DMA_SLEEP; + dmareq.ElanAddr = (E3_Addr) 0; + dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; + dmareq.Perm = ELAN_PERM_REMOTEWRITE; + + rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState, + KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + + KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, + &dmareq, &kqswnal_data.kqn_eprxdmahandle); + if (rc != DDI_SUCCESS) + { + CERROR ("Can't reserve rx dma space\n"); + kqswnal_finalise (); + return (-ENOMEM); + } + + /**********************************************************************/ + /* Allocate/Initialise transmit descriptors */ + + PORTAL_ALLOC(kqswnal_data.kqn_txds, + sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + if (kqswnal_data.kqn_txds == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + /* clear flags, null pointers etc */ + memset(kqswnal_data.kqn_txds, 0, + sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS)); + for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) + { + int premapped_pages; + kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i]; + int basepage = i * KQSW_NTXMSGPAGES; + + PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); + if (ktx->ktx_buffer == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + /* Map pre-allocated buffer NOW, to save latency on transmit */ + premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer, + KQSW_TX_BUFFER_SIZE); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, + basepage, &ktx->ktx_ebuffer); + + ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ + ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ + + if (i < KQSW_NTXMSGS) + ktx->ktx_idle = &kqswnal_data.kqn_idletxds; + else + ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds; + + list_add_tail (&ktx->ktx_list, ktx->ktx_idle); + } + + 
/**********************************************************************/ + /* Allocate/Initialise receive descriptors */ + + PORTAL_ALLOC (kqswnal_data.kqn_rxds, + sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE)); + if (kqswnal_data.kqn_rxds == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */ + sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE)); + + elan_page_idx = 0; + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + E3_Addr elanaddr; + int j; + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + if (i < KQSW_NRXMSGS_SMALL) + { + krx->krx_npages = KQSW_NRXMSGPAGES_SMALL; + krx->krx_eprx = kqswnal_data.kqn_eprx_small; + } + else + { + krx->krx_npages = KQSW_NRXMSGPAGES_LARGE; + krx->krx_eprx = kqswnal_data.kqn_eprx_large; + } + + LASSERT (krx->krx_npages > 0); + for (j = 0; j < krx->krx_npages; j++) + { + krx->krx_pages[j] = alloc_page (GFP_KERNEL); + if (krx->krx_pages[j] == NULL) + { + kqswnal_finalise (); + return (-ENOMEM); + } + + LASSERT(page_address(krx->krx_pages[j]) != NULL); + + elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eprxdmahandle, + page_address(krx->krx_pages[j]), + PAGE_SIZE, elan_page_idx, + &elanaddr); + elan_page_idx++; + + if (j == 0) + krx->krx_elanaddr = elanaddr; + + /* NB we assume a contiguous */ + LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE); + } + } + LASSERT (elan_page_idx == + (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) + + (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE)); + + /**********************************************************************/ + /* Network interface ready to initialise */ + + rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni); + if (rc != 0) + { + CERROR ("PtlNIInit failed %d\n", rc); + kqswnal_finalise (); + return (-ENOMEM); + } + + kqswnal_data.kqn_init = KQN_INIT_PTL; + + 
/**********************************************************************/ + /* Queue receives, now that it's OK to run their completion callbacks */ + + for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + { + kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; + + /* NB this enqueue can allocate/sleep (attr == 0) */ + rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, + krx->krx_elanaddr, + krx->krx_npages * PAGE_SIZE, 0); + if (rc != 0) + { + CERROR ("failed ep_queue_receive %d\n", rc); + kqswnal_finalise (); + return (-ENOMEM); + } + } + + /**********************************************************************/ + /* Spawn scheduling threads */ + for (i = 0; i < smp_num_cpus; i++) + { + rc = kqswnal_thread_start (kqswnal_scheduler, NULL); + if (rc != 0) + { + CERROR ("failed to spawn scheduling thread: %d\n", rc); + kqswnal_finalise (); + return (rc); + } + } + + /**********************************************************************/ + /* Connect to the router */ + rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface); + CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc); + + PORTAL_SYMBOL_REGISTER(kqswnal_ni); + kqswnal_data.kqn_init = KQN_INIT_ALL; + + printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d " + "(Routing %s, initial mem %d)\n", + ep_nodeid (kqswnal_data.kqn_epdev), + ep_numnodes (kqswnal_data.kqn_epdev), + kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + + +MODULE_AUTHOR("W. 
Marcus Miller "); +MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00"); +MODULE_LICENSE("GPL"); + +module_init (kqswnal_initialise); +module_exit (kqswnal_finalise); + +EXPORT_SYMBOL (kqswnal_ni); diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h new file mode 100644 index 0000000..657b02b --- /dev/null +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -0,0 +1,249 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. 
+ * + */ + +#ifndef _QSWNAL_H +#define _QSWNAL_H +#define EXPORT_SYMTAB + +#ifdef PROPRIETARY_ELAN +# include +#else +# include +#endif + +#undef printf /* nasty QSW #define */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_QSWNAL + +#include +#include +#include + +#define KQSW_CHECKSUM 0 +#if KQSW_CHECKSUM +typedef unsigned long kqsw_csum_t; +#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) +#else +#define KQSW_CSUM_SIZE 0 +#endif +#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) + +/* + * Elan NAL + */ +#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (small payloads, i.e. payload <= KQSW_SMALLPAYLOAD) */ +#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (large payloads, i.e. payload > KQSW_SMALLPAYLOAD) */ +/* NB small/large message sizes are GLOBAL constants */ + +/* + * Performance Tuning defines + * NB no mention of PAGE_SIZE for interoperability + */ +#if PTL_LARGE_MTU +# define KQSW_MAXPAYLOAD (256<<10) /* biggest message this NAL will cope with */ +#else +# define KQSW_MAXPAYLOAD (64<<10) /* biggest message this NAL will cope with */ +#endif + +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ + +#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ + +#define KQSW_NTXMSGS 8 /* # normal transmit messages */ +#define KQSW_NNBLK_TXMSGS 128 /* # reserved transmit messages if can't block */ + +#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ +#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ + +#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ +#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ + +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield 
*/ + +/* + * derived constants + */ + +#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +/* The pre-allocated tx buffer (hdr + small payload) */ + +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +/* Reserve elan address space for pre-allocated and pre-mapped transmit + * buffer and a full payload too. Extra pages allow for page alignment */ + +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) + +#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) +/* biggest complete packet we can receive (or transmit) */ + + +typedef struct +{ + struct list_head krx_list; /* enqueue -> thread */ + EP_RCVR *krx_eprx; /* port to post receives to */ + EP_RXD *krx_rxd; /* receive descriptor (for repost) */ + E3_Addr krx_elanaddr; /* Elan address of buffer (contiguous in elan vm) */ + int krx_npages; /* # pages in receive buffer */ + int krx_nob; /* Number Of Bytes received into buffer */ + kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ + struct page *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */ + struct iovec krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */ +} kqswnal_rx_t; + +typedef struct +{ + struct list_head ktx_list; /* enqueue idle/delayed */ + struct list_head *ktx_idle; /* where to put when idle */ + char ktx_state; /* What I'm doing */ + uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ + int ktx_npages; /* pages reserved for mapping messages */ + int ktx_nmappedpages; /* # pages mapped for current message */ + EP_IOVEC ktx_iov[EP_MAXFRAG]; /* msg frags (elan vaddrs) */ + int ktx_niov; /* # message frags */ + int ktx_port; /* destination ep port */ 
+ ptl_nid_t ktx_nid; /* destination node */ + void *ktx_args[2]; /* completion passthru */ + E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ + char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ +} kqswnal_tx_t; + +#define KTX_IDLE 0 /* MUST BE ZERO (so zeroed ktx is idle) */ +#define KTX_SENDING 1 /* local send */ +#define KTX_FORWARDING 2 /* routing a packet */ + +typedef struct +{ + char kqn_init; /* what's been initialised */ + char kqn_shuttingdown; /* I'm trying to shut down */ + atomic_t kqn_nthreads; /* # threads still running */ + + kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + + struct list_head kqn_idletxds; /* transmit descriptors free to use */ + struct list_head kqn_nblk_idletxds; /* reserve of */ + spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ + wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ + struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ + + spinlock_t kqn_sched_lock; /* serialise packet schedulers */ + wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ + + struct list_head kqn_readyrxds; /* rxds full of data */ + struct list_head kqn_delayedfwds; /* delayed forwards */ + struct list_head kqn_delayedtxds; /* delayed transmits */ + + spinlock_t kqn_statelock; /* cb_cli/cb_sti */ + nal_cb_t *kqn_cb; /* -> kqswnal_lib */ + EP_DEV *kqn_epdev; /* elan device */ + EP_XMTR *kqn_eptx; /* elan transmitter */ + EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ + EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ + ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ + ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ + kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ +} kqswnal_data_t; + +/* kqn_init state */ +#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state 
is initialised OK */ +#define KQN_INIT_DATA 1 +#define KQN_INIT_PTL 2 +#define KQN_INIT_ALL 3 + +extern nal_cb_t kqswnal_lib; +extern nal_t kqswnal_api; +extern kqswnal_data_t kqswnal_data; + +extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); +extern void kqswnal_rxhandler(EP_RXD *rxd); +extern int kqswnal_scheduler (void *); +extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +static inline void +kqswnal_requeue_rx (kqswnal_rx_t *krx) +{ + ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE); +} + +static inline int +kqswnal_pages_spanned (void *base, int nob) +{ + unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT; + unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT; + + LASSERT (last_page >= first_page); /* can't wrap address space */ + return (last_page - first_page + 1); +} + +#if KQSW_CHECKSUM +static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) +{ + unsigned char *ptr = (unsigned char *)base; + + while (nob-- > 0) + sum += *ptr++; + + return (sum); +} +#endif + +#endif /* _QSWNAL_H */ diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c new file mode 100644 index 0000000..5979885 --- /dev/null +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -0,0 +1,1242 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. Marcus Miller - Based on ksocknal + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "qswnal.h" + +atomic_t kqswnal_packets_launched; +atomic_t kqswnal_packets_transmitted; +atomic_t kqswnal_packets_received; + + +/* + * LIB functions follow + * + */ +static int +kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static int +kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static void * +kqswnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return (buf); +} + +static void +kqswnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + + +static void +kqswnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kqn_statelock, *flags); +} + + +static void +kqswnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kqn_statelock, *flags); +} + + +static int +kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + *dist = (nid == nal->ni.nid) ? 0 : 1; + return (0); +} + +int +kqswnal_ispeer (ptl_nid_t nid) +{ + unsigned int elanid = (unsigned int)nid; + + /* didn't lose high bits on conversion and it's in this machine? */ + return ((ptl_nid_t)elanid == nid && + elanid < ep_numnodes (kqswnal_data.kqn_epdev)); +} + +void +kqswnal_unmap_tx (kqswnal_tx_t *ktx) +{ + if (ktx->ktx_nmappedpages == 0) + return; + + CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n", + ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages); + + LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages); + LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <= + kqswnal_data.kqn_eptxdmahandle->NumDvmaPages); + + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ktx->ktx_basepage, ktx->ktx_nmappedpages); + ktx->ktx_nmappedpages = 0; +} + +int +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + char *ptr; + + LASSERT (nmapped <= maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int 
fraglen = kiov->kiov_len; + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + /* each frag fits in a page */ + LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + + nmapped++; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + /* XXX this is really crap, but we'll have to kmap until + * EKC has a page (rather than vaddr) mapping interface */ + + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, page %d, %d total\n", + ktx, nfrags, ptr, fraglen, basepage, nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ptr, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + + kunmap (kiov->kiov_page); + + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage++; + kiov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +int +kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + + LASSERT (nmapped <= 
maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int fraglen = iov->iov_len; + long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + + nmapped += npages; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", + ktx, nfrags, iov->iov_base, fraglen, basepage, npages, + nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + iov->iov_base, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage += npages; + iov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +void +kqswnal_put_idle_tx (kqswnal_tx_t *ktx) +{ + kpr_fwd_desc_t *fwd = NULL; + struct list_head *idle = ktx->ktx_idle; + unsigned long flags; + + kqswnal_unmap_tx (ktx); /* release temporary mappings */ + ktx->ktx_state = KTX_IDLE; + + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + list_add (&ktx->ktx_list, idle); + + /* reserved for non-blocking tx */ + if (idle == &kqswnal_data.kqn_nblk_idletxds) { + 
spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + return; + } + + /* anything blocking for a tx descriptor? */ + if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ + { + CDEBUG(D_NET,"wakeup fwd\n"); + + fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + } + + if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq)) /* process? */ + { + /* local sender waiting for tx desc */ + CDEBUG(D_NET,"wakeup process\n"); + wake_up (&kqswnal_data.kqn_idletxd_waitq); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + if (fwd == NULL) + return; + + /* schedule packet for forwarding again */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +kqswnal_tx_t * +kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) +{ + unsigned long flags; + kqswnal_tx_t *ktx = NULL; + + for (;;) { + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kqswnal_data.kqn_idletxds)) { + ktx = list_entry (kqswnal_data.kqn_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* "normal" descriptor pool is empty */ + + if (fwd != NULL) { /* forwarded packet => queue for idle txd */ + CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); + list_add_tail (&fwd->kprfd_list, + &kqswnal_data.kqn_idletxd_fwdq); + break; + } + + /* doing a local transmit */ + if (!may_block) { + if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { + CERROR ("intr tx desc pool exhausted\n"); + break; + } + + ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* block for idle tx */ + + 
spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + CDEBUG (D_NET, "blocking for tx desc\n"); + wait_event (kqswnal_data.kqn_idletxd_waitq, + !list_empty (&kqswnal_data.kqn_idletxds)); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ + LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); + return (ktx); +} + +void +kqswnal_tx_done (kqswnal_tx_t *ktx, int error) +{ + switch (ktx->ktx_state) { + case KTX_FORWARDING: /* router asked me to forward this packet */ + kpr_fwd_done (&kqswnal_data.kqn_router, + (kpr_fwd_desc_t *)ktx->ktx_args[0], error); + break; + + case KTX_SENDING: /* packet sourced locally */ + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + (lib_msg_t *)ktx->ktx_args[1]); + break; + + default: + LASSERT (0); + } + + kqswnal_put_idle_tx (ktx); +} + +static void +kqswnal_txhandler(EP_TXD *txd, void *arg, int status) +{ + kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; + + LASSERT (txd != NULL); + LASSERT (ktx != NULL); + + CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status); + + if (status == EP_SUCCESS) + atomic_inc (&kqswnal_packets_transmitted); + + if (status != EP_SUCCESS) + { + CERROR ("kqswnal: Transmit failed with %d\n", status); + status = -EIO; + } + + kqswnal_tx_done (ktx, status); +} + +int +kqswnal_launch (kqswnal_tx_t *ktx) +{ + /* Don't block for transmit descriptor if we're in interrupt context */ + int attr = in_interrupt() ? 
(EP_NO_SLEEP | EP_NO_ALLOC) : 0; + int rc = ep_transmit_large(kqswnal_data.kqn_eptx, ktx->ktx_nid, + ktx->ktx_port, attr, kqswnal_txhandler, + ktx, ktx->ktx_iov, ktx->ktx_niov); + long flags; + + if (rc == 0) + atomic_inc (&kqswnal_packets_launched); + + if (rc != ENOMEM) + return (rc); + + /* can't allocate ep txd => queue for later */ + + LASSERT (in_interrupt()); /* not called by thread (not looping) */ + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + return (0); +} + + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + default: + return (""); + } +} + +static void +kqswnal_cerror_hdr(ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + CERROR("P3 Header at %p of type %s\n", hdr, type_str); + CERROR(" From nid/pid "LPU64"/%u", NTOH__u64(hdr->src_nid), + NTOH__u32(hdr->src_pid)); + CERROR(" To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid), + NTOH__u32(hdr->dest_pid)); + + switch (NTOH__u32(hdr->type)) { + case PTL_MSG_PUT: + CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 (hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + NTOH__u64 (hdr->msg.put.match_bits)); + CERROR(" Length %d, offset %d, hdr data "LPX64"\n", + NTOH__u32(PTL_HDR_LENGTH(hdr)), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + CERROR(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 (hdr->msg.get.ptl_index), + hdr->msg.get.return_wmd.wh_interface_cookie, + 
hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CERROR(" Length %d, src offset %d\n", + NTOH__u32 (hdr->msg.get.sink_length), + NTOH__u32 (hdr->msg.get.src_offset)); + break; + + case PTL_MSG_ACK: + CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + NTOH__u32 (hdr->msg.ack.mlength)); + break; + + case PTL_MSG_REPLY: + CERROR(" dst md "LPX64"."LPX64", length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + NTOH__u32 (PTL_HDR_LENGTH(hdr))); + } + +} /* end of print_hdr() */ + +static int +kqswnal_sendmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + kqswnal_tx_t *ktx; + int rc; + ptl_nid_t gatewaynid; +#if KQSW_CHECKSUM + int i; + kqsw_csum_t csum; + int sumnob; +#endif + + /* NB, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. + */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 + " pid %u\n", payload_nob, payload_niov, nid, pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + if (payload_nob > KQSW_MAXPAYLOAD) { + CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", + payload_nob, KQSW_MAXPAYLOAD); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + if (!kqswnal_ispeer (nid)) { /* Can't send direct: find gateway? 
*/ + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + if (!kqswnal_ispeer (gatewaynid)) { + CERROR("Bad gateway "LPX64" for "LPX64"\n", + gatewaynid, nid); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + nid = gatewaynid; + } + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (ktx == NULL) { + kqswnal_cerror_hdr (hdr); + lib_finalize (&kqswnal_lib, private, cookie); + } + + memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + +#if KQSW_CHECKSUM + csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); + memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); + for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + if (payload_kiov != NULL) { + ptl_kiov_t *kiov = &payload_kiov[i]; + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset; + + csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); + sumnob -= kiov->kiov_len; + } else { + struct iovec *iov = &payload_iov[i]; + + csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); + sumnob -= iov->iov_len; + } + } + memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); +#endif + + /* Set up first frag from pre-mapped buffer (it's at least the + * portals header) */ + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; + ktx->ktx_iov[0].Len = KQSW_HDR_SIZE; + ktx->ktx_niov = 1; + + if (payload_nob > 0) { /* got some payload (something more to do) */ + /* make a single contiguous message? 
*/ + if (payload_nob <= KQSW_TX_MAXCONTIG) { + /* copy payload to ktx_buffer, immediately after hdr */ + if (payload_kiov != NULL) + lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_kiov, payload_nob); + else + lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_iov, payload_nob); + /* first frag includes payload */ + ktx->ktx_iov[0].Len += payload_nob; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov (ktx, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov (ktx, payload_nob, + payload_niov, payload_iov); + if (rc != 0) { + kqswnal_put_idle_tx (ktx); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + } + } + + ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_SENDING; /* => lib_finalize() on completion */ + ktx->ktx_args[0] = private; + ktx->ktx_args[1] = cookie; + + rc = kqswnal_launch (ktx); + if (rc != 0) { /* failed? 
*/ + CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + return (0); +} + +static int +kqswnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, payload_nob)); +} + +static int +kqswnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + +int kqswnal_fwd_copy_contig = 0; + +void +kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + int rc; + kqswnal_tx_t *ktx; + struct iovec *iov = fwd->kprfd_iov; + int niov = fwd->kprfd_niov; + int nob = fwd->kprfd_nob; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + /* The router wants this NAL to forward a packet */ + CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + fwd, nid, niov, nob); + + LASSERT (niov > 0); + + ktx = kqswnal_get_idle_tx (fwd, FALSE); + if (ktx == NULL) /* can't get txd right now */ + return; /* fwd will be scheduled when tx desc freed */ + + if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + nid = fwd->kprfd_target_nid; /* target is final dest */ + + if (!kqswnal_ispeer (nid)) { + CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); + rc = -EHOSTUNREACH; + goto failed; + } + + if (nob > KQSW_NRXMSGBYTES_LARGE) { + CERROR ("Can't forward [%p] to "LPX64 + ": size %d bigger than max packet 
size %ld\n", + fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); + rc = -EMSGSIZE; + goto failed; + } + + if ((kqswnal_fwd_copy_contig || niov > 1) && + nob <= KQSW_TX_BUFFER_SIZE) + { + /* send from ktx's pre-allocated/mapped contiguous buffer? */ + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ + ktx->ktx_iov[0].Len = nob; + ktx->ktx_niov = 1; + } + else + { + /* zero copy */ + ktx->ktx_niov = 0; /* no frags mapped yet */ + rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + if (rc != 0) + goto failed; + } + + ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_FORWARDING; /* kpr_put_packet() on completion */ + ktx->ktx_args[0] = fwd; + + rc = kqswnal_launch (ktx); + if (rc == 0) + return; + + failed: + LASSERT (rc != 0); + CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); + + kqswnal_put_idle_tx (ktx); + /* complete now (with failure) */ + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; + + /* The router has finished forwarding this packet */ + + if (error != 0) + { + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); + } + + kqswnal_requeue_rx (krx); +} + +void +kqswnal_rx (kqswnal_rx_t *krx) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int nob; + int niov; + + if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ + /* NB krx requeued when lib_parse() calls back kqswnal_recv */ + lib_parse (&kqswnal_lib, hdr, krx); + return; + } + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + if 
(kqswnal_ispeer (dest_nid)) /* should have gone direct to peer */ + { + CERROR("dropping packet from "LPX64" for "LPX64 + ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); + kqswnal_requeue_rx (krx); + return; + } + + /* NB forwarding may destroy iov; rebuild every time */ + for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) + { + LASSERT (niov < krx->krx_npages); + krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); + krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + } + + kpr_fwd_init (&krx->krx_fwd, dest_nid, + krx->krx_nob, niov, krx->krx_iov, + kqswnal_fwd_callback, krx); + + kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); +} + +/* Receive Interrupt Handler: posts to schedulers */ +void +kqswnal_rxhandler(EP_RXD *rxd) +{ + long flags; + int nob = ep_rxd_len (rxd); + int status = ep_rxd_status (rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); + + CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", + rxd, krx, nob, status); + + LASSERT (krx != NULL); + + krx->krx_rxd = rxd; + krx->krx_nob = nob; + + /* must receive a whole header to be able to parse */ + if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) + { + /* receives complete with failure when receiver is removed */ + if (kqswnal_data.kqn_shuttingdown) + return; + + CERROR("receive status failed with status %d nob %d\n", + ep_rxd_status(rxd), nob); + kqswnal_requeue_rx (krx); + return; + } + + atomic_inc (&kqswnal_packets_received); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +#if KQSW_CHECKSUM +/* Log a checksum mismatch on krx: ishdr selects the "Header" or "Payload" + * wording; the nids, pids and type are read from the header in the first rx + * page, followed by a type-specific dump of the ACK/PUT fields. */ +void +kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + /* one argument per conversion: %s %p dnid snid dpid spid type */ + CERROR ("%s checksum mismatch %p: dnid "LPX64", snid 
"LPX64 + ", dpid %d, spid %d, type %d\n", + ishdr ? "Header" : "Payload", krx, + NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid), + NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid), + NTOH__u32(hdr->type)); + + switch (NTOH__u32 (hdr->type)) + { + case PTL_MSG_ACK: + CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 + " len %u\n", + NTOH__u32(hdr->msg.ack.mlength), + hdr->msg.ack.dst_wmd.handle_cookie, + hdr->msg.ack.dst_wmd.handle_idx, + NTOH__u64(hdr->msg.ack.match_bits), + NTOH__u32(hdr->msg.ack.length)); + break; + case PTL_MSG_PUT: + CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 + " len %u off %u data "LPX64"\n", + NTOH__u32(hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.handle_cookie, + hdr->msg.put.ack_wmd.handle_idx, + NTOH__u64(hdr->msg.put.match_bits), + NTOH__u32(hdr->msg.put.length), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + case PTL_MSG_GET: + CERROR ("GET: <>\n"); + break; + case PTL_MSG_REPLY: + CERROR ("REPLY: <>\n"); + break; + default: + CERROR ("TYPE?: <>\n"); + } +} +#endif + +static int +kqswnal_recvmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + int page; + char *page_ptr; + int page_nob; + char *iov_ptr; + int iov_nob; + int frag; +#if KQSW_CHECKSUM + kqsw_csum_t senders_csum; + kqsw_csum_t payload_csum = 0; + kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), + sizeof(ptl_hdr_t)); + size_t csum_len = mlen; + int csum_frags = 0; + int csum_nob = 0; + static atomic_t csum_counter; + int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; + + atomic_inc (&csum_counter); + + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + if (senders_csum != hdr_csum) + kqswnal_csum_error (krx, 1); +#endif + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); + 
+ /* What was actually received must be >= payload. + * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ + LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + LASSERT (mlen <= rlen); + + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + if (mlen != 0) + { + page = 0; + page_ptr = ((char *) page_address(krx->krx_pages[0])) + + KQSW_HDR_SIZE; + page_nob = PAGE_SIZE - KQSW_HDR_SIZE; + + LASSERT (niov > 0); + if (kiov != NULL) { + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + + for (;;) + { + /* We expect the iov to exactly match mlen */ + LASSERT (iov_nob <= mlen); + + frag = MIN (page_nob, iov_nob); + memcpy (iov_ptr, page_ptr, frag); +#if KQSW_CHECKSUM + payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); + csum_nob += frag; + csum_frags++; +#endif + mlen -= frag; + if (mlen == 0) + break; + + page_nob -= frag; + if (page_nob != 0) + page_ptr += frag; + else + { + page++; + LASSERT (page < krx->krx_npages); + page_ptr = page_address(krx->krx_pages[page]); + page_nob = PAGE_SIZE; + } + + iov_nob -= frag; + if (iov_nob != 0) + iov_ptr += frag; + else if (kiov != NULL) { + kunmap (kiov->kiov_page); + kiov++; + niov--; + LASSERT (niov > 0); + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov++; + niov--; + LASSERT (niov > 0); + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + } + + if (kiov != NULL) + kunmap (kiov->kiov_page); + } + +#if KQSW_CHECKSUM + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + + if (csum_len != rlen) + CERROR("Unable to checksum data in user's buffer\n"); + else if (senders_csum != payload_csum) + kqswnal_csum_error (krx, 0); + + 
if (csum_verbose) + CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " + "csum_nob %d\n", + hdr_csum, payload_csum, csum_frags, csum_nob); +#endif + lib_finalize(nal, private, cookie); + + kqswnal_requeue_rx (krx); + + return (rlen); +} + +static int +kqswnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + +static int +kqswnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + +int +kqswnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kqswnal_data.kqn_nthreads); + return (0); +} + +void +kqswnal_thread_fini (void) +{ + atomic_dec (&kqswnal_data.kqn_nthreads); +} + +int +kqswnal_scheduler (void *arg) +{ + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; + kpr_fwd_desc_t *fwd; + long flags; + int rc; + int counter = 0; + int did_something; + + kportal_daemonize ("kqswnal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + while (!kqswnal_data.kqn_shuttingdown) + { + did_something = FALSE; + + if (!list_empty (&kqswnal_data.kqn_readyrxds)) + { + krx = list_entry(kqswnal_data.kqn_readyrxds.next, + kqswnal_rx_t, krx_list); + list_del (&krx->krx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + kqswnal_rx (krx); + + did_something = TRUE; + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) + { + ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + rc = kqswnal_launch (ktx); + if 
(rc != 0) /* failed: ktx_nid down? */ + { + CERROR("Failed delayed transmit to "LPX64 + ": %d\n", ktx->ktx_nid, rc); + kqswnal_tx_done (ktx, rc); + } + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_fwd_packet (NULL, fwd); + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == KQSW_RESCHED) { + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, + kqswnal_data.kqn_shuttingdown || + !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_delayedtxds) || + !list_empty(&kqswnal_data.kqn_delayedfwds)); + LASSERT (rc == 0); + } else if (current->need_resched) + schedule (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_thread_fini (); + return (0); +} + +nal_cb_t kqswnal_lib = +{ + nal_data: &kqswnal_data, /* NAL private data */ + cb_send: kqswnal_send, + cb_send_pages: kqswnal_send_pages, + cb_recv: kqswnal_recv, + cb_recv_pages: kqswnal_recv_pages, + cb_read: kqswnal_read, + cb_write: kqswnal_write, + cb_malloc: kqswnal_malloc, + cb_free: kqswnal_free, + cb_printf: kqswnal_printf, + cb_cli: kqswnal_cli, + cb_sti: kqswnal_sti, + cb_dist: kqswnal_dist +}; diff --git a/lustre/portals/knals/scimacnal/Makefile.am b/lustre/portals/knals/scimacnal/Makefile.am new file mode 100644 index 0000000..6da31f0 --- /dev/null +++ b/lustre/portals/knals/scimacnal/Makefile.am @@ -0,0 +1,11 @@ +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kscimacnal +modulenet_DATA = kscimacnal.o +EXTRA_PROGRAMS = kscimacnal + +DEFS = +kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h diff --git a/lustre/portals/knals/scimacnal/README.scimacnal b/lustre/portals/knals/scimacnal/README.scimacnal new file mode 100644 index 0000000..d4c6a49 --- /dev/null +++ b/lustre/portals/knals/scimacnal/README.scimacnal @@ -0,0 +1,14 @@ + +scimacnal - A NAL for the Scali ScaMAC midlayer. + +The ScaMAC midlayer is a simplified API to the SCI high performance +interconnect. + +In order to use this NAL you'll need to tune scimac to use larger buffers. +See scimac.conf in this directory for an example. + +Overall performance and stability aren't great, but this can be attributed +to the scimac driver, which apparently is in need of some development. + +TODO: +Routing isn't yet implemented. diff --git a/lustre/portals/knals/scimacnal/scimac.conf b/lustre/portals/knals/scimacnal/scimac.conf new file mode 100644 index 0000000..bfb6d02 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimac.conf @@ -0,0 +1,35 @@ +# Configuration file for the scimac driver - lustre friendly settings +# + +# The maximal number of message headers to use in the system. +scimac_max_no_hdrs = 32 + +# The maximal number of eager buffers to use in the system. +scimac_max_no_ebufs = 8 + +# The maximal size in bytes of each eager buffer. +scimac_max_ebuf_size = 65536 + +# Enable use of a kernel thread to defer reception of packets. +# Default is to use a tasklet (sw interrupt). +scimac_use_ulevel_recv = 1 + +# The maximal number of packets queued for transfer per path at any one time. +scimac_max_send_queuelen = 2000 + +# The packet retransmit time in milliseconds. +# The time elapsed from a send attempt until the packet is resent. +scimac_pkt_rexmit_time = 200 + +# The packet's maximal retransmit time in milliseconds. 
+# The total time that a packet will be attempted sent before it is dropped. +scimac_max_rexmit_time = 5000 + +# The lowest valid node identifier in the system. +scimac_min_nodeid_number = 0x100 + +# The largest valid node identifier in the system. +scimac_max_nodeid_number = 0xff00 + +# The incremental nodeid step in the system. +scimac_nodeid_increment = 0x100 diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c new file mode 100644 index 0000000..1066d69 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal.c @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * Based on gmnal, which is based on ksocknal and qswnal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + + +#include "scimacnal.h" + +ptl_handle_ni_t kscimacnal_ni; +nal_t kscimacnal_api; + +kscimacnal_data_t kscimacnal_data; + +kpr_nal_interface_t kscimacnal_router_interface = { + kprni_nalid: SCIMACNAL, + kprni_arg: NULL, + kprni_fwd: kscimacnal_fwd_packet, +}; + + +/* API forward hook: asserts that nal is this module's single instance and + * passes the request to lib_dispatch(); always returns PTL_OK. + * args_len and ret_len are not used. */ +static int kscimacnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */ + return PTL_OK; +} + + +/* API lock hook: delegates to the library's cb_cli callback. */ +static void kscimacnal_lock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + + +/* API unlock hook: delegates to the library's cb_sti callback. */ +static void kscimacnal_unlock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + + +/* API shutdown hook: only checks the nal pointer; returns 0. ni is unused. */ +static int kscimacnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kscimacnal_api); + return 0; +} + + +/* API yield hook: calls schedule() when the current task has need_resched set. */ +static void kscimacnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kscimacnal_api); + + if (current->need_resched) + schedule(); + return; +} + + +/* Startup callback passed to PtlNIInit(): runs lib_init() with this node's + * nid and a fixed nnids, then returns the API nal. + * interface and requested_pid are not used. */ +static nal_t *kscimacnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + int nnids = 512; /* FIXME: Need ScaMac function to get #nodes */ + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", kscimacnal_data.ksci_nid, nnids); + lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size); + return &kscimacnal_api; +} + + +/* Called by kernel at module unload time */ 
+static void __exit +kscimacnal_finalize(void) +{ + /* FIXME: How should the shutdown procedure really look? */ + kscimacnal_data.ksci_shuttingdown=1; + + PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni); + + PtlNIFini(kscimacnal_ni); + lib_fini(&kscimacnal_lib); + + mac_finish(kscimacnal_data.ksci_machandle); + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + + +/* Called by kernel at module insertion time */ +static int __init +kscimacnal_initialize(void) +{ + int rc; + unsigned long nid=0; + mac_handle_t *machandle = NULL; + + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kscimacnal_api.forward = kscimacnal_forward; + kscimacnal_api.shutdown = kscimacnal_shutdown; + kscimacnal_api.yield = kscimacnal_yield; + kscimacnal_api.validate = NULL; /* our api validate is a NOOP */ + kscimacnal_api.lock= kscimacnal_lock; + kscimacnal_api.unlock= kscimacnal_unlock; + kscimacnal_api.nal_data = &kscimacnal_data; + + kscimacnal_lib.nal_data = &kscimacnal_data; + + memset(&kscimacnal_data, 0, sizeof(kscimacnal_data)); + + kscimacnal_data.ksci_cb = &kscimacnal_lib; + + /* We're not using this, but cli/sti callbacks does... ??? 
*/ + spin_lock_init(&kscimacnal_data.ksci_dispatch_lock); + + /* FIXME: We only support one adapter for now */ + machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx, + &kscimacnal_data); + + if(!machandle) { + CERROR("mac_init() failed\n"); + return -1; + } + + kscimacnal_data.ksci_machandle = machandle; + + /* Make sure the scimac MTU is tuned */ + if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) { + CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n", + mac_get_mtusize(machandle), SCIMACNAL_MTU); + CERROR("Consult README.scimacnal for more information\n"); + mac_finish(machandle); + return -1; + } + + /* Get the node ID */ + /* mac_get_physaddrlen() is a function instead of define, sigh */ + LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid)); + if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) { + CERROR("mac_get_physaddr() failed\n"); + mac_finish(machandle); + return -1; + } + nid = ntohl(nid); + kscimacnal_data.ksci_nid = nid; + + + /* Initialize Network Interface */ + /* FIXME: What do the magic numbers mean? Documentation anyone? 
*/ + rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + mac_finish(machandle); + return (-ENOMEM); + } + + PORTAL_SYMBOL_REGISTER(kscimacnal_ni); + + /* We're done now, it's OK for the RX callback to do stuff */ + kscimacnal_data.ksci_init = 1; + + return 0; +} + + +MODULE_AUTHOR("Niklas Edmundsson "); +MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0"); +MODULE_LICENSE("GPL"); + +module_init (kscimacnal_initialize); +module_exit (kscimacnal_finalize); + +EXPORT_SYMBOL(kscimacnal_ni); diff --git a/lustre/portals/knals/scimacnal/scimacnal.h b/lustre/portals/knals/scimacnal/scimacnal.h new file mode 100644 index 0000000..1ff180e --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal.h @@ -0,0 +1,85 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + */ + + +#ifndef _SCIMACNAL_H +#define _SCIMACNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For PAGE_SIZE */ + +#define DEBUG_SUBSYSTEM S_UNDEFINED + +#include +#include +#include + +#include + +#ifndef MAC_SAPID_LUSTRE +#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1 +#endif /* MAC_SAPID_LUSTRE */ + +#define SCIMACNAL_MTU 65536 +/* FIXME: What is really the MTU of lustre? */ +#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU +#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger. 
+#endif + +typedef struct { + mac_handle_t *handle; + mac_mblk_t *msg; + mac_msg_type_t type; + void *userdata; +} kscimacnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + ptl_hdr_t ktx_hdr; +} kscimacnal_tx_t; + + +typedef struct { + char ksci_init; + char ksci_shuttingdown; + ptl_nid_t ksci_nid; + nal_cb_t *ksci_cb; + spinlock_t ksci_dispatch_lock; + mac_handle_t *ksci_machandle; +} kscimacnal_data_t; + +extern kscimacnal_data_t kscimacnal_data; +extern nal_t kscimacnal_api; +extern nal_cb_t kscimacnal_lib; + +void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata); + + +#endif /* _SCIMACNAL_H */ diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c new file mode 100644 index 0000000..7e4a2e8 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal_cb.c @@ -0,0 +1,468 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "scimacnal.h" + +static int +kscimacnal_read (nal_cb_t *nal, void *private, + void *dst_addr, user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static int +kscimacnal_write(nal_cb_t *nal, void *private, + user_ptr dst_addr, void *src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static void * +kscimacnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + + +static void +kscimacnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + + +static void +kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...) +{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kscimacnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->ksci_dispatch_lock,*flags); +} + + +static void +kscimacnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags); +} + + +static int +kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* FIXME: Network distance has a meaning, but is there no easy + * way to figure it out (depends on routing) */ + + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + + +static +char * get_mac_error(mac_status_t status) +{ + switch(status) { + case MAC_MSG_STAT_OK: + return "MAC_MSG_STAT_OK"; + case MAC_MSG_STAT_FREED: + return "MAC_MSG_STAT_FREED"; + case MAC_MSG_STAT_ABORTED: + return "MAC_MSG_STAT_ABORTED"; + case 
MAC_MSG_STAT_TIMEDOUT: + return "MAC_MSG_STAT_TIMEDOUT"; + case MAC_MSG_STAT_NODEUNREACH: + return "MAC_MSG_STAT_NODEUNREACH"; + case MAC_MSG_STAT_NETDOWN: + return "MAC_MSG_STAT_NETDOWN"; + case MAC_MSG_STAT_RESET: + return "MAC_MSG_STAT_RESET"; + case MAC_MSG_STAT_INITFAILED: + return "MAC_MSG_STAT_INITFAILED"; + case MAC_MSG_STAT_SYNCFAILED: + return "MAC_MSG_STAT_SYNCFAILED"; + case MAC_MSG_STAT_BADPROTO: + return "MAC_MSG_STAT_BADPROTO"; + case MAC_MSG_STAT_NOBUFSPACE: + return "MAC_MSG_STAT_NOBUFSPACE"; + case MAC_MSG_STAT_CONGESTION: + return "MAC_MSG_STAT_CONGESTION"; + case MAC_MSG_STAT_OTHER: + return "MAC_MSG_STAT_OTHER"; + default: + return "Unknown error"; + } +} + + +/* FIXME add routing code here ? */ + +/* Called by ScaMac when transmission is complete (ie. message is released). + * context is the kscimacnal_tx_t saved by kscimacnal_send(); the message is + * finalized towards portals and the tx descriptor is freed here. */ +static void +kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) +{ + kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context; + int err=0; /* set to -EIO on failure below, but never read afterwards */ + + LASSERT (ktx != NULL); + + /* Euh, there is no feedback when transmission fails?! */ + switch(status) { + case MAC_MSG_STAT_OK: /* normal */ + break; + default: + CERROR("%s (%d):\n", get_mac_error(status), status); + err = -EIO; + break; + } + + /* finalize is called for both success and failure; err is not passed on */ + lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie); + + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); +} + + +/* Called by portals when it wants to send a message. + * Since ScaMAC has its own TX thread we don't bother setting up our own. 
*/ +static int +kscimacnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_len) +{ + kscimacnal_tx_t *ktx=NULL; + kscimacnal_data_t *ksci = nal->nal_data; + int rc=0; + int buf_len = sizeof(ptl_hdr_t) + payload_len; + mac_mblk_t *msg=NULL, *lastblk, *newblk; + unsigned long physaddr; + + + CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n", + payload_len, payload_iov, nid, payload_niov); + + LASSERT(ksci != NULL); + + LASSERT(hdr != NULL); + + /* Do real check if we can send this */ + if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { + CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n", + mac_get_mtusize(ksci->ksci_machandle)); + return -EINVAL; + } + + + /* save transaction info for later finalize and cleanup */ + PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t))); + if (!ktx) { + return -ENOMEM; + } + + /* *SIGH* hdr is a stack variable in the calling function, so we + * need to copy it to a buffer. Zerocopy magic (or is it just + * deferred memcpy?) is annoying sometimes. */ + memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t)); + + /* First, put the header in the main message mblk */ + msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t), + kscimacnal_txrelease, ktx); + if (!msg) { + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(msg, sizeof(ptl_hdr_t)); + lastblk=msg; + + /* Allocate additional mblks for each iov as needed. 
+ * Essentially lib_copy_iov2buf with a twist or two */ + while (payload_len > 0) + { + ptl_size_t nob; + + LASSERT (payload_niov > 0); + + nob = MIN (payload_iov->iov_len, payload_len); + + /* We don't need a callback on the additional mblks, since + * all release callbacks seem to be called when the entire + * message has been sent */ + newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL); + if(!newblk) { + /* NOTE(review): msg carries kscimacnal_txrelease as its + * release callback with ktx as context. If mac_free_msg() + * runs that callback, ktx (whose nal/private/cookie fields + * are only set after this loop) would be finalized and + * freed there and then freed again below -- confirm + * against the ScaMAC API. */ + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(newblk, nob); + mac_link_mblk(lastblk, newblk); + lastblk=newblk; + + payload_len -= nob; + payload_niov--; + payload_iov++; + } + + /* remember what kscimacnal_txrelease() needs for lib_finalize() */ + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + + CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid); + + physaddr = htonl(nid); + + if((rc=mac_send(ksci->ksci_machandle, msg, + (mac_physaddr_t *) &physaddr))) { + CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc); + /* NOTE(review): same question as in the loop above -- if + * mac_free_msg() triggers kscimacnal_txrelease(), ktx is + * freed twice and lib_finalize() runs although rc is + * returned as a failure; confirm. */ + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return rc; + } + + return 0; +} + + +/* Router forwarding hook (kprni_fwd in kscimacnal_router_interface): + * only logs an error; arg and fwd are not touched. */ +void +kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + + +/* Process a received portals packet */ +/* Called by the ScaMac RX thread when a packet is received */ +void +kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, + void *userdata) +{ + ptl_hdr_t *hdr = NULL; + kscimacnal_rx_t krx; + mac_size_t size; + kscimacnal_data_t *ksci = userdata; + + LASSERT(ksci != NULL); + + if ( !ksci->ksci_init || ksci->ksci_shuttingdown || + type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) { + /* We're not interested in messages not for us, ignore */ + mac_free_msg(msg); + return; + } + + size = mac_msg_size(msg); + + CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", + msg, type, size, mac_msg_mblks(msg)); + + if( size < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? 
*/ + if (ksci->ksci_shuttingdown) { + /* shutdown raced with the check at the top: still + * release msg, like every other exit path here */ + mac_free_msg(msg); + return; + } + CERROR("kscimacnal: did not receive complete portal header," + "size= %ld\n", size); + /* Free the message before exiting */ + mac_free_msg(msg); + return; + } + + /* Provide everything we know */ + krx.handle = handle; + krx.msg = msg; + krx.type = type; + krx.userdata = userdata; + + /* mac_msg_next returns the next mblk with unread data */ + hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) ); + + if(!hdr) { + CERROR("kscimacnal: no data block in message %p\n", msg); + mac_free_msg(msg); + return; + } + + if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) { + PROF_START(lib_parse); + /* sets wanted_len, iovs etc and calls our callback */ + lib_parse(&kscimacnal_lib, hdr, &krx); + PROF_FINISH(lib_parse); +#if 0 /* FIXME: Is it possible to detect this? */ + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx:" + "target is a peer\n", + hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); +#endif /* if 0 FIXME */ + } else { + /* forward to gateway */ + CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n", + kscimacnal_lib.ni.nid, hdr->dest_nid); + } + + /* krx lives on this stack frame, so msg is released here once + * lib_parse() (if it was called) has returned */ + mac_free_msg(msg); + + CDEBUG(D_NET, "msg %p: Done\n", msg); +} + + +/* Called by portals to process a received packet */ +static int kscimacnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + kscimacnal_rx_t *krx = private; + mac_mblk_t *mblk; + void *src; + mac_size_t pkt_len; + ptl_size_t iovused=0; + + LASSERT (krx != NULL); + LASSERT (krx->msg != NULL); + + CDEBUG(D_NET,"msg %p: mlen=%d, rlen=%d, niov=%d\n", + krx->msg, mlen, rlen, niov); + + /* What was actually received must be >= what sender claims to have + * sent. This is an LASSERT, since lib-move doesn't check cb return + * code yet. Also, rlen seems to be negative when mlen==0 so don't + * assert on that. 
+ */ + LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); + LASSERT (mlen==0 || mlen <= rlen); + + PROF_START(memcpy); + + /* mac_msg_next returns next mblk with unread data (ie. can + * be same mblk */ + while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) { + pkt_len = mac_mblk_len(mblk); + src = mac_get_mblk(mblk, pkt_len); /* Next unread block */ + + CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld src: %p\n", + krx->msg, mblk, pkt_len, src); + + LASSERT(src != NULL); + + /* Essentially lib_copy_buf2iov but with continuation support, + * we "gracefully" thrash the argument vars ;) */ + while (pkt_len > 0) { + ptl_size_t nob; + + LASSERT (niov > 0); + + LASSERT(iovused < iov->iov_len); + + nob = MIN (iov->iov_len-iovused, pkt_len); + CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p nob: %d " + "iovused: %d\n", + iov->iov_base, iov->iov_len, + src, nob, iovused); + + memcpy (iov->iov_base+iovused, src, nob); + pkt_len -= nob; + src += nob; + + if(nob+iovused < iov->iov_len) { + /* We didn't use all of the iov */ + iovused+=nob; + } + else { + niov--; + iov++; + iovused=0; + } + } + } + PROF_FINISH(memcpy); + + CDEBUG(D_NET, "Calling lib_finalize.\n"); + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + CDEBUG(D_NET, "Done.\n"); + + return rlen; +} + + +nal_cb_t kscimacnal_lib = { + nal_data: &kscimacnal_data, /* NAL private data */ + cb_send: kscimacnal_send, + cb_send_pages: NULL, /* Ignore for now */ + cb_recv: kscimacnal_recv, + cb_recv_pages: NULL, + cb_read: kscimacnal_read, + cb_write: kscimacnal_write, + cb_malloc: kscimacnal_malloc, + cb_free: kscimacnal_free, + cb_printf: kscimacnal_printf, + cb_cli: kscimacnal_cli, + cb_sti: kscimacnal_sti, + cb_dist: kscimacnal_dist +}; diff --git a/lustre/portals/knals/socknal/Makefile.am b/lustre/portals/knals/socknal/Makefile.am new file mode 100644 index 0000000..437d7fc --- /dev/null +++ b/lustre/portals/knals/socknal/Makefile.am @@ -0,0 +1,13 @@ +# 
Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ksocknal +modulenet_DATA = ksocknal.o +EXTRA_PROGRAMS = ksocknal + +DEFS = +ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lustre/portals/knals/socknal/Makefile.mk b/lustre/portals/knals/socknal/Makefile.mk new file mode 100644 index 0000000..46edf01 --- /dev/null +++ b/lustre/portals/knals/socknal/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Kernelenv + +obj-y += ksocknal.o +ksocknal-objs := socknal.o socknal_cb.o + diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c new file mode 100644 index 0000000..d15d8c8 --- /dev/null +++ b/lustre/portals/knals/socknal/socknal.c @@ -0,0 +1,863 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socknal.h" + +ptl_handle_ni_t ksocknal_ni; +static nal_t ksocknal_api; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +ksock_nal_data_t ksocknal_data; +#else +static ksock_nal_data_t ksocknal_data; +#endif + +kpr_nal_interface_t ksocknal_router_interface = { + kprni_nalid: SOCKNAL, + kprni_arg: &ksocknal_data, + kprni_fwd: ksocknal_fwd_packet, +}; + + +int +ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ + return PTL_OK; +} + +int +ksocknal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ksocknal_close_sock(0); /* close all sockets */ +} + +void +ksocknal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ksocknal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void +ksocknal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ksocknal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ksocknal_data.ksnd_mynid); + lib_init(&ksocknal_lib, ksocknal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ksocknal_api); +} + +/* + * EXTRA functions follow + */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ksocknal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ksocknal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at 
module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. */ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->nid); + + ksocknal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +void +ksocknal_bind_irq (unsigned int irq, int cpu) +{ +#if (defined(CONFIG_SMP) && CPU_AFFINITY) + char cmdline[64]; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + snprintf (cmdline, sizeof (cmdline), + "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); + + printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); + + /* FIXME: Find a better method of setting IRQ affinity... + */ + + call_usermodehelper (argv[0], argv, envp); +#endif +} + +int +ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + ksock_sched_t *sched = NULL; + unsigned int irq = 0; + struct net_device *dev = NULL; + int ret; + int idx; + ENTRY; + + LASSERT (!in_interrupt()); + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->data_ready; + conn->ksnc_saved_write_space = sock->sk->write_space; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + 
conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + +#warning check it is OK to derefence sk->dst_cache->dev like this... + lock_sock (conn->ksnc_sock->sk); + + if (conn->ksnc_sock->sk->dst_cache != NULL) { + dev = conn->ksnc_sock->sk->dst_cache->dev; + if (dev != NULL) { + irq = dev->irq; + if (irq >= NR_IRQS) { + CERROR ("Unexpected IRQ %x\n", irq); + irq = 0; + } + } + } + + release_sock (conn->ksnc_sock->sk); + + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (irq == 0 || + ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { + /* This is a software NIC, or we haven't associated it with + * a CPU yet */ + + /* Choose the CPU with the fewest connections */ + sched = ksocknal_data.ksnd_schedulers; + for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[idx].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[idx]; + + if (irq != 0) { /* Hardware NIC */ + /* Remember which scheduler we chose */ + idx = sched - ksocknal_data.ksnd_schedulers; + + LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + + if (bind_irq) /* remember if we will bind below */ + idx |= SOCKNAL_IRQ_BOUND; + + ksocknal_data.ksnd_irq_info[irq] = idx; + } + } else { + /* This is a hardware NIC, associated with a CPU */ + idx = ksocknal_data.ksnd_irq_info[irq]; + + /* Don't bind again if we've bound already */ + if ((idx & SOCKNAL_IRQ_BOUND) != 0) + bind_irq = 0; + + sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + } + + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (bind_irq && /* irq binding required */ + irq != 0) /* hardware NIC */ + ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + + /* NOW it's safe to get called back when socket is ready... 
*/ + sock->sk->user_data = conn; + sock->sk->data_ready = ksocknal_data_ready; + sock->sk->write_space = ksocknal_write_space; + + /* ...which I call right now to get things going */ + ksocknal_data_ready (sock->sk, 0); + ksocknal_write_space (sock->sk); + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ksocknal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid == 0) { /* close ALL connections */ + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ksocknal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ksocknal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid && list_empty (&death_row)) + return (-ENOENT); + + while (!list_empty (&death_row)) { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + /* NB I _have_ to restore the callback, rather than storing + * a noop, since the socket could survive past this module + * being unloaded!! */ + conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space; + + /* OK; no more callbacks, but they could be in progress now, + * so wait for them to complete... 
*/ + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + /* ...however if I get the lock before a callback gets it, + * this will make them noop + */ + conn->ksnc_sock->sk->user_data = NULL; + + /* And drop the scheduler's connection count while I've got + * the exclusive lock */ + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, + flags); + + ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ + } + + return (0); +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + return &(sk->tp_pinfo.af_tcp); +} +#else +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + struct tcp_sock *s = (struct tcp_sock *)sk; + return &s->tcp; +} +#endif + +void +ksocknal_push_conn (ksock_conn_t *conn) +{ + struct sock *sk = conn->ksnc_sock->sk; + struct tcp_opt *tp = sock2tcp_opt(sk); + int nonagle; + int val = 1; + int rc; + mm_segment_t oldmm; + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + oldmm = get_fs (); + set_fs (KERNEL_DS); + + rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof (val)); + LASSERT (rc == 0); + + set_fs (oldmm); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); +} + +/* Passing in a zero nid pushes all connections */ +int +ksocknal_push_sock (ptl_nid_t nid) +{ + ksock_conn_t *conn; + struct list_head *tmp; + int index; + int i; + + if (nid != 0) { + conn = ksocknal_get_conn (nid); + + if (conn == NULL) + return (-ENOENT); + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + + return (0); + } + + /* NB we can't remove connections from the socket list so we have to + * cope with them being removed from under us... 
+ */ + for (index = 0; ; index++) { + read_lock (&ksocknal_data.ksnd_socklist_lock); + + i = 0; + conn = NULL; + + list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); // take a ref + break; + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + if (conn == NULL) + break; + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + } + + return (0); +} + +ksock_conn_t * +ksocknal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ksocknal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", + nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ksocknal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, sizeof (*conn)); + + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." 
*/ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready); + LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space); + LASSERT (conn->ksnc_sock->sk->user_data == NULL); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) { + ksocknal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} + +int +ksocknal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, + data->ioc_flags); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ksocknal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ksocknal_set_mynid (data->ioc_nid); + break; + } + case NAL_CMD_PUSH_CONNECTION: { + rc = ksocknal_push_sock (data->ioc_nid); + break; + } + } + + return rc; +} + +void +ksocknal_free_buffers (void) +{ + if (ksocknal_data.ksnd_fmbs != NULL) { + ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; + i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); + i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ksocknal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ksocknal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ksocknal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + + SOCKNAL_NNBLK_LTXS)); + + if (ksocknal_data.ksnd_schedulers != NULL) + PORTAL_FREE (ksocknal_data.ksnd_schedulers, + sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); +} + +void __exit +ksocknal_module_fini (void) +{ + int 
i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(SOCKNAL); + PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ksocknal_ni); + lib_fini(&ksocknal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); + LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = + &ksocknal_data.ksnd_schedulers[i]; + + LASSERT (list_empty (&kss->kss_tx_conns)); + LASSERT (list_empty (&kss->kss_rx_conns)); + LASSERT (kss->kss_nconns == 0); + } + + /* stop router calling me */ + kpr_shutdown (&ksocknal_data.ksnd_router); + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_reaper_waitq); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) + wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + + while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ksocknal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ksocknal_data.ksnd_router); + + ksocknal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + + +int __init 
+ksocknal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ksocknal_api.forward = ksocknal_api_forward; + ksocknal_api.shutdown = ksocknal_api_shutdown; + ksocknal_api.yield = ksocknal_api_yield; + ksocknal_api.validate = NULL; /* our api validate is a NOOP */ + ksocknal_api.lock = ksocknal_api_lock; + ksocknal_api.unlock = ksocknal_api_unlock; + ksocknal_api.nal_data = &ksocknal_data; + + ksocknal_lib.nal_data = &ksocknal_data; + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); + rwlock_init(&ksocknal_data.ksnd_socklist_lock); + + ksocknal_data.ksnd_nal_cb = &ksocknal_lib; + spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); + + spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, + sizeof (ksocknal_data.ksnd_irq_info)); + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + + PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + sizeof(ksock_sched_t) * 
SOCKNAL_N_SCHED); + if (ksocknal_data.ksnd_schedulers == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; + + spin_lock_init (&kss->kss_lock); + INIT_LIST_HEAD (&kss->kss_rx_conns); + INIT_LIST_HEAD (&kss->kss_tx_conns); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&kss->kss_zctxdone_list); +#endif + init_waitqueue_head (&kss->kss_waitq); + } + + CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, + sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS)); + if (ksocknal_data.ksnd_ltxs == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ksocknal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ksocknal_data.ksnd_idle_ltx_list : + &ksocknal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ksocknal_ni, ~0); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + rc = ksocknal_thread_start (ksocknal_scheduler, + &ksocknal_data.ksnd_schedulers[i]); + if (rc != 0) { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", + i, rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + + rc = ksocknal_thread_start (ksocknal_reaper, NULL); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ksocknal_data.ksnd_router, + &ksocknal_router_interface); + if (rc != 0) { + CDEBUG(D_NET, "Can't initialise routing interface " + "(rc = %d): not routing\n", rc); + } else { + /* Only allocate forwarding buffers if I'm on a gateway */ + + PORTAL_ALLOC(ksocknal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + if (ksocknal_data.ksnd_fmbs == NULL) { + ksocknal_module_fini (); + RETURN(-ENOMEM); + } + + /* NULL out buffer pointers etc */ + memset(ksocknal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS); i++) { + ksock_fmb_t *fmb = + &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i]; + + if (i < SOCKNAL_SMALL_FWD_NMSGS) { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; + } else { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) { + fmb->fmb_pages[j] = alloc_page (GFP_KERNEL); + + if 
(fmb->fmb_pages[j] == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + LASSERT(page_address (fmb->fmb_pages[j]) != + NULL); + } + + list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + } + + rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + ksocknal_module_fini (); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(ksocknal_ni); + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " + "mem %d)\n", + kpr_routing (&ksocknal_data.ksnd_router) ? + "enabled" : "disabled", pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); + +EXPORT_SYMBOL (ksocknal_ni); diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h new file mode 100644 index 0000000..46ee3b7 --- /dev/null +++ b/lustre/portals/knals/socknal/socknal.h @@ -0,0 +1,293 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +typedef struct /* pool of forwarding buffers */ +{ + spinlock_t fmp_lock; /* serialise */ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + struct list_head kss_tx_conns; /* conn waiting to be written */ +#if 
SOCKNAL_ZC + struct list_head kss_zctxdone_list; /* completed ZC transmits */ +#endif + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + int kss_nconns; /* # connections assigned to this scheduler */ +} ksock_sched_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + ksock_sched_t *ksnd_schedulers; /* scheduler state */ + + kpr_router_t ksnd_router; /* THE router */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + spinlock_t ksnd_idle_ltx_lock; /* serialise ltx alloc/free */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ +#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ +#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments and 0 or more ptl_kiov_t 
fragments. Forwarded + * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 + * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 + * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t + * fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, if the message + * requires forwarding or will be received into mapped memory, up to + * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. + * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. + */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet iovec frags */ + struct iovec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + ptl_kiov_t *tx_kiov; /* packet page frags */ +#if SOCKNAL_ZC + ksock_sched_t *tx_sched; /* who to wake on callback */ + zccd_t tx_zccd; /* zero copy callback descriptor */ +#endif +} ksock_tx_t; + +#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the tx frag descriptors: hdr is always 1 iovec + * and payload is PTL_MD_MAX of either type. 
*/ +typedef struct +{ + struct iovec hdr; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } payload; +} ksock_txiovspace_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + ksock_txiovspace_t ltx_iov_space; /* where to stash frag descriptors */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or PTL_MD_MAX_IOV frags of payload of either type. 
*/ +typedef union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + volatile int ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct iovec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + volatile int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int 
ksocknal_add_sock (ptl_nid_t nid, int fd, int client); +extern int ksocknal_close_sock(ptl_nid_t nid); +extern int ksocknal_set_mynid(ptl_nid_t nid); +extern int ksocknal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid); +extern void _ksocknal_put_conn (ksock_conn_t *conn); +extern void ksocknal_close_conn (ksock_conn_t *conn); + +static inline void +ksocknal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ksocknal_put_conn (conn); +} + +extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_reaper (void *arg); +extern void ksocknal_data_ready(struct sock *sk, int n); +extern void ksocknal_write_space(struct sock *sk); + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c new file mode 100644 index 0000000..388554d --- /dev/null +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -0,0 +1,1612 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +atomic_t ksocknal_packets_received; +atomic_t ksocknal_packets_launched; +atomic_t ksocknal_packets_being_sent; + +#if SOCKNAL_ZC +int ksocknal_do_zc = 1; +int ksocknal_zc_min_frag = 2048; +#endif + +/* + * LIB functions follow + * + */ +int +ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ksocknal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ksocknal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +void +ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+/* Take the NAL callback lock.  NB 'flags' is not used: this is a plain
+ * spin_lock(), not an irqsave variant. */
+void
+ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+/* Release the NAL callback lock taken by ksocknal_cli(); 'flags' is
+ * not used here either. */
+void
+ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+/* Report the "distance" to 'nid': 0 if it is this node's own NID,
+ * otherwise 1.  Always returns 0. */
+int
+ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ksocknal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+/* Get an idle local-transmit descriptor.
+ *
+ * The normal idle list (ksnd_idle_ltx_list) is always tried first.  If
+ * it is empty and 'may_block' is zero, fall back to the non-blocking
+ * idle list (ksnd_idle_nblk_ltx_list) and return NULL if that is empty
+ * too.  If 'may_block' is non-zero, sleep on ksnd_idle_ltx_waitq until
+ * the normal list is non-empty, then retry.  The descriptor returned
+ * has been unlinked from its idle list. */
+ksock_ltx_t *
+ksocknal_get_ltx (int may_block)
+{
+        unsigned long flags;    /* spin_lock_irqsave() wants unsigned long */
+        ksock_ltx_t *ltx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) {
+                        ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next,
+                                         ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) {
+                                ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next,
+                                                 ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                /* must drop the lock before sleeping */
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock,
+                                       flags);
+
+                wait_event (ksocknal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ksocknal_data.ksnd_idle_ltx_list));
+        }
+
+        /* both 'break's above leave the loop with the lock held */
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        return (ltx);
+}
+
+#if SOCKNAL_ZC
+struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page =
vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (page == NULL || + !VALID_PAGE (page)) + return (NULL); + + return (page); +} +#endif + +int +ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +{ + struct iovec *iov = tx->tx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; +#if SOCKNAL_ZC + int offset = vaddr & (PAGE_SIZE - 1); + int zcsize = MIN (fragsize, PAGE_SIZE - offset); + struct page *page; +#endif + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (tx->tx_niov > 0); + more |= (tx->tx_niov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + zcsize >= ksocknal_zc_min_frag && + (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { + + CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", + (void *)vaddr, page, page_address(page), offset, zcsize); + + more |= (zcsize < fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, zcsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + /* NB don't pass tx's iov; sendmsg may or may not update it */ + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_iov++; + tx->tx_niov--; + return (1); +} + +int +ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +{ + ptl_kiov_t *kiov = tx->tx_kiov; + int fragsize = kiov->kiov_len; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_nkiov > 0); + more |= (tx->tx_nkiov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + fragsize >= ksocknal_zc_min_frag) { + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, fragsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + char *addr = ((char *)kmap (page)) + offset; + struct iovec fragiov = {.iov_base = addr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + kunmap (page); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_kiov++; + tx->tx_nkiov--; + return (1); +} + +int +ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +{ + int rc; + int sent_some = 0; + ENTRY; + + LASSERT (!in_interrupt()); + + for (;;) { + if (tx->tx_niov != 0) + rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + else + rc = ksocknal_send_kiov (sock, tx, more); + + /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ + if (rc <= 0) /* error or partial send */ + RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); + + if (tx->tx_nob == 0) /* sent everything */ + RETURN (0); + + sent_some = 1; + } +} + +int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct iovec *iov = conn->ksnc_rx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_iov++; + conn->ksnc_rx_niov--; + return (1); +} + +int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + unsigned long vaddr = ((unsigned long)kmap (page)) + offset; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        /* NB this is just a boolean............................^ */
+        set_fs (oldmm);
+        kunmap (page);
+
+        if (rc <= 0)
+                return (rc);
+
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+
+        if (rc < fragsize) {
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_kiov++;
+        conn->ksnc_rx_nkiov--;
+        return (1);
+}
+
+/* Receive as many of conn's outstanding rx fragments as the socket will
+ * give us without blocking, one fragment per call to
+ * ksocknal_recv_iov() / ksocknal_recv_kiov().
+ *
+ * Returns 1 when everything wanted has arrived (ksnc_rx_nob_wanted == 0)
+ * or when a fragment came up short (-EAGAIN from the helper); otherwise
+ * returns the helper's own result, i.e. 0 for EOF or < 0 for an error.
+ *
+ * NB this function used to carry a 'got_some' flag that was only ever
+ * assigned 0, so it never influenced the result.  It has been dropped
+ * rather than set: setting it would turn an EOF or hard error that
+ * follows a complete fragment into a "success" return. */
+int
+ksocknal_recvmsg (ksock_conn_t *conn)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT (!in_interrupt ());
+
+        for (;;) {
+                LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                /* CAVEAT EMPTOR: we return...
+                 * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */
+
+                if (rc <= 0)            /* error/EOF or partial receive */
+                        RETURN ((rc == -EAGAIN) ? 1 : rc);
+
+                if (conn->ksnc_rx_nob_wanted == 0)
+                        RETURN (1);
+        }
+}
+
+#if SOCKNAL_ZC
+/* Zero-copy completion callback: queue the tx on its scheduler's
+ * kss_zctxdone_list and wake the scheduler if it is waiting. */
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd);
+        ksock_sched_t *sched = tx->tx_sched;
+        unsigned long flags;
+        ENTRY;
+
+        /* Schedule tx for cleanup (can't do it now due to lock conflicts) */
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        EXIT;
+}
+#endif
+
+/* Complete a transmit descriptor.
+ *
+ * Forwarded packets are handed back to the router with kpr_fwd_done()
+ * and status 0.  Local packets are completed with lib_finalize() and
+ * their ltx is put back on the idle list recorded in ltx_idle; anyone
+ * sleeping for a normal descriptor is woken when it goes back on
+ * ksnd_idle_ltx_list. */
+void
+ksocknal_tx_done (ksock_tx_t *tx)
+{
+        unsigned long flags;    /* spin_lock_irqsave() wants unsigned long */
+        ksock_ltx_t *ltx;
+        ENTRY;
+
+        atomic_dec (&ksocknal_packets_being_sent);
+
+        if (tx->tx_isfwd) {             /* was a forwarded packet? */
+                kpr_fwd_done (&ksocknal_data.ksnd_router,
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                EXIT;
+                return;
+        }
+
+        /* local send */
+        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+        /* normal tx desc => wakeup anyone blocking for one */
+        if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list &&
+            waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq))
+                wake_up (&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+        EXIT;
+}
+
+void
+ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_tx_t *tx;
+        int rc;
+
+        LASSERT (!list_empty (&sched->kss_tx_conns));
+        conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
+        list_del (&conn->ksnc_tx_list);
+
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+        tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        /* assume transmit will complete now, so dequeue while I've got lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to write */
+
+        rc = ksocknal_sendmsg (conn->ksnc_sock, tx,
+                               !list_empty (&conn->ksnc_tx_queue)); /* more to come? */
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc != 0) {
+#warning FIXME: handle socket errors properly
+                CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                /* kid on for now the whole packet went.
+                 * NB when we handle the error better, we'll still need to
+                 * block for zccd completion.
+ */ + tx->tx_nob = 0; + } + + if (tx->tx_nob == 0) /* nothing left to send */ + { + /* everything went; assume more can go, so prevent write_space locking */ + conn->ksnc_tx_ready = 1; + + ksocknal_put_conn (conn); /* release packet's ref */ + atomic_inc (&ksocknal_packets_being_sent); +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + /* zccd skbufs are still in-flight. Release my + * initial ref on zccd, so callback can occur */ + zccd_put (&tx->tx_zccd); + } else +#endif + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + } else { + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); +} + +void +ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + unsigned long flags; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
+ */ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", + ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, + tx->tx_niov, tx->tx_nkiov); + +#if SOCKNAL_ZC + zccd_init (&tx->tx_zccd, ksocknal_zc_callback); + /* NB this sets 1 ref on zccd, so the callback can only occur + * after I've released this ref */ + tx->tx_sched = sched; +#endif + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + + atomic_inc (&ksocknal_packets_launched); +} + +ksock_conn_t * +ksocknal_send_target (ptl_nid_t nid) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + int rc; + + if ((conn = ksocknal_get_conn (nid)) == NULL) { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + return (NULL); + } + + if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { + CERROR ("Can't route to "LPX64": gateway "LPX64 + " is not a peer\n", nid, gatewaynid); + return (NULL); + } + } + + return (conn); +} + +ksock_ltx_t * +ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type) +{ + ksock_ltx_t *ltx; + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. 
*/
+        ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt ()));
+        if (ltx == NULL) {
+                CERROR ("Can't allocate tx desc\n");
+                return (NULL);
+        }
+
+        /* Init local send packet (storage for hdr, finalize() args) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        /* Init common ltx_tx */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr);
+
+        /* We always have 1 mapped frag for the header; it points at the
+         * private copy of the header taken above, not at the caller's */
+        ltx->ltx_tx.tx_niov = 1;
+        ltx->ltx_tx.tx_iov = &ltx->ltx_iov_space.hdr;
+        ltx->ltx_tx.tx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        ltx->ltx_tx.tx_kiov = NULL;
+        ltx->ltx_tx.tx_nkiov = 0;
+
+        return (ltx);
+}
+
+/* Send a header plus 'payload_len' bytes of payload described by
+ * 'payload_niov' iovec fragments to 'nid'.
+ *
+ * The connection is chosen by ksocknal_send_target() and the packet is
+ * queued with ksocknal_launch_packet().  Returns 0 once queued, or -1 if
+ * no connection or tx descriptor could be obtained; in the failure case
+ * the message is completed here with lib_finalize(). */
+int
+ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it
+         *
+         * Also, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        /* append the payload_iovs to the one pointing at the header */
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/* As ksocknal_send(), but the payload is described by 'payload_niov'
+ * page fragments (ptl_kiov_t), which are copied into the descriptor's
+ * kiov space; the header remains the single mapped iovec fragment.
+ *
+ * NOTE(review): unlike ksocknal_send(), the failure paths below return
+ * -1 WITHOUT calling lib_finalize().  ksocknal_send() documents that
+ * its return code is ignored; confirm whether the same holds for this
+ * entry point, in which case these paths need to finalize too. */
+int
+ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL)
+                return (-1);
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                return (-1);
+        }
+
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/* Router callback: transmit the forwarding descriptor 'fwd'.  The tx
+ * descriptor lives in fwd->kprfd_scratch.  If no connection to the next
+ * hop exists the forward is completed at once with -EHOSTUNREACH. */
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                   /* This is a forwarding packet */
+        tx->tx_nob = fwd->kprfd_nob;
+        tx->tx_niov = fwd->kprfd_niov;
+        tx->tx_iov = fwd->kprfd_iov;
+        tx->tx_nkiov = 0;
+        tx->tx_kiov = NULL;
+
+        ksocknal_launch_packet (conn, tx);
+}
+
+/* Start a kernel thread running fn(arg).  Returns 0 and bumps
+ * ksnd_nthreads on success, or the negative value returned by
+ * kernel_thread() on failure. */
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ksocknal_data.ksnd_nthreads);
+        return (0);
+}
+
+/* Drop the thread count taken in ksocknal_thread_start(). */
+void
+ksocknal_thread_fini (void)
+{
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+}
+
+void
+ksocknal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ksock_fmb_pool_t *fmp = fmb->fmb_pool; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn = NULL; + ksock_sched_t *sched; + long flags; + + if (error != 0) + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid), + error); + else + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", + NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + + spin_lock_irqsave (&fmp->fmp_lock, flags); + + list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); + + if (!list_empty (&fmp->fmp_blocked_conns)) { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + } + + spin_unlock_irqrestore (&fmp->fmp_lock, flags); + + if (conn == NULL) + return; + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); +} + +ksock_fmb_t * +ksocknal_get_idle_fmb (ksock_conn_t *conn) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + long flags; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (ksocknal_data.ksnd_fmbs != NULL); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; + + spin_lock_irqsave (&pool->fmp_lock, flags); + + if (!list_empty (&pool->fmp_idle_fmbs)) { + fmb = list_entry(pool->fmp_idle_fmbs.next, + ksock_fmb_t, 
fmb_list); + list_del (&fmb->fmb_list); + spin_unlock_irqrestore (&pool->fmp_lock, flags); + + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + + spin_unlock_irqrestore (&pool->fmp_lock, flags); + return (NULL); +} + + +int +ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload start reading reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + /* copy header */ + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + + if (payload_nob == 0) { /* got complete packet already */ + atomic_inc (&ksocknal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, 1, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* forward it now */ + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + } else { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = + page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, niov, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / + sizeof (struct iovec)); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = + fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], + (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG 
(D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
+
+/* Decide what to do with a just-read header that is not addressed to
+ * this node.
+ *
+ * The packet is dropped (its body skipped via ksocknal_new_packet())
+ * if the length is corrupt, forwarding is not enabled, the payload is
+ * larger than SOCKNAL_MAX_FWD_PAYLOAD, or the destination is actually a
+ * direct peer.  Otherwise the connection moves to SOCKNAL_RX_GET_FMB
+ * with ksnc_rx_nob_left/ksnc_rx_nob_wanted set to the payload size.
+ *
+ * NB header fields are converted with NTOH__u64()/NTOH__u32() before
+ * use; the peer lookup and every message below use the converted
+ * (host-order) NIDs, never the raw header fields. */
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid);
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                src_nid, dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) {                     /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", src_nid, dest_nid, body_len);
+                ksocknal_new_packet (conn, 0);  /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) {  /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", src_nid, dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) { /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n", src_nid,
+                        dest_nid, body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct */
+        conn2 = ksocknal_get_conn (dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n", src_nid, dest_nid);
+                ksocknal_put_conn (conn2);      /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+/* Set up conn's receive state for what comes next on the stream.
+ *
+ * nob_to_skip == 0: we are at a packet boundary; point the rx iov at
+ * conn->ksnc_hdr ready to read a header and return 1.
+ *
+ * nob_to_skip != 0: enter SOCKNAL_RX_SLOP and point as many rx iov
+ * entries as are needed (or available) at the static slop buffer so the
+ * bytes are read and discarded; ksnc_rx_nob_wanted is what this pass
+ * will consume, ksnc_rx_nob_left the full amount.  Returns 0. */
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ksocknal_slop_buffer[4096];
+
+        int nob;
+        int niov;
+        int skipped;
+
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now.  If there's more left
+         * (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        skipped = 0;
+        niov = 0;
+
+        do {
+                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                /* every entry targets the same scratch buffer: the data
+                 * is being thrown away */
+                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -=nob;
+
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
+
+void
+ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_fmb_t *fmb;
+        int rc;
+
+        /* NB: sched->kss_lock lock held */
+
+        LASSERT (!list_empty (&sched->kss_rx_conns));
+        conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list);
+        list_del (&conn->ksnc_rx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        CDEBUG(D_NET, "sched %p conn %p\n", sched, conn);
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT
(conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* doesn't need a forwarding buffer */ + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) + goto try_read; + + get_fmb: + fmb = ksocknal_get_idle_fmb (conn); + if (fmb == NULL) { /* conn descheduled waiting for idle fmb */ + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + return; + } + + if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + rc = ksocknal_recvmsg(conn); + + if (rc == 0) + goto out; + if (rc < 0) { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read %p: %d\n", conn, rc); + goto out; + } + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + /* got all I wanted, assume there's more - prevent data_ready locking */ + conn->ksnc_rx_ready = 1; + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: + /* It's not for me */ + if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + ksocknal_fwd_parse (conn); + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + goto get_fmb; /* => go get a fwd msg buffer */ + default: + LBUG (); + } + /* Not Reached */ + } + + PROF_START(lib_parse); + /* sets wanted_len, iovs etc */ + lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + PROF_FINISH(lib_parse); + + if 
(conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ksocknal_packets_received); + /* packet is done now */ + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + NTOH__u64 (conn->ksnc_hdr.dest_nid), + conn->ksnc_rx_nob_left); + + atomic_inc (&ksocknal_packets_received); + + /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ + kpr_fwd_start (&ksocknal_data.ksnd_router, + (kpr_fwd_desc_t *)conn->ksnc_cookie); + + /* no slop in forwarded packets */ + LASSERT (conn->ksnc_rx_nob_left == 0); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + break; /* unexpected rx state: leave the switch and hit the LBUG() below */ + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* no data there to read?
*/ + if (!conn->ksnc_rx_ready) { + /* let socket callback schedule again */ + conn->ksnc_rx_scheduled = 0; + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); +} + +int +ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int +ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int ksocknal_scheduler (void *arg) +{ + ksock_sched_t *sched = (ksock_sched_t *)arg; + unsigned long flags; + int rc; + int nloops = 0; + int id = sched - ksocknal_data.ksnd_schedulers; + char name[16]; +#if (CONFIG_SMP && CPU_AFFINITY) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + int cpu = 
cpu_logical_map(id % num_online_cpus()); +#else +#warning "Take care of architecure specific logical APIC map" + int cpu = 1; /* Have to change later. */ +#endif /* LINUX_VERSION_CODE */ + + set_cpus_allowed (current, 1 << cpu); + id = cpu; +#endif /* CONFIG_SMP && CPU_AFFINITY */ + + snprintf (name, sizeof (name),"ksocknald[%d]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&sched->kss_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + did_something = 1; + /* drops & regains kss_lock */ + ksocknal_process_receive (sched, &flags); + } + + if (!list_empty (&sched->kss_tx_conns)) { + did_something = 1; + /* drops and regains kss_lock */ + ksocknal_process_transmit (sched, &flags); + } +#if SOCKNAL_ZC + if (!list_empty (&sched->kss_zctxdone_list)) { + ksock_tx_t *tx = + list_entry(sched->kss_zctxdone_list.next, + ksock_tx_t, tx_list); + did_something = 1; + + list_del (&tx->tx_list); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, flags); + } +#endif + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_irqrestore (&sched->kss_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ +#if SOCKNAL_ZC + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns) || + !list_empty(&sched->kss_zctxdone_list)); +#else + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns)); +#endif + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&sched->kss_lock, flags); + } + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + ksocknal_thread_fini (); + return (0); +} + +void +ksocknal_data_ready (struct sock *sk, int n) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_sched_t *sched; + ENTRY; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->data_ready != &ksocknal_data_ready); + sk->data_ready (sk, n); + } else if (!conn->ksnc_rx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_rx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_receive may have cleared while I blocked for the lock) */ + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + EXIT; +} + +void +ksocknal_write_space (struct sock *sk) +{ + unsigned long flags; + ksock_conn_t *conn; + 
ksock_sched_t *sched; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->write_space != &ksocknal_write_space); + sk->write_space (sk); + } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + clear_bit (SOCK_NOSPACE, &sk->socket->flags); + + if (!conn->ksnc_tx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_tx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_transmit may have + cleared while I blocked for the lock) */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && // not being progressed + !list_empty(&conn->ksnc_tx_queue)){//packets to send + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); +} + +int +ksocknal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ksocknal_reaper"); + kportal_blockallsigs (); + + while (!ksocknal_data.ksnd_shuttingdown) { + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ksocknal_data.ksnd_reaper_list)) { + conn = NULL; + } else { + conn = list_entry 
(ksocknal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ksocknal_close_conn (conn); + else { + rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&ksocknal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ksocknal_thread_fini (); + return (0); +} + +nal_cb_t ksocknal_lib = { + nal_data: &ksocknal_data, /* NAL private data */ + cb_send: ksocknal_send, + cb_send_pages: ksocknal_send_pages, + cb_recv: ksocknal_recv, + cb_recv_pages: ksocknal_recv_pages, + cb_read: ksocknal_read, + cb_write: ksocknal_write, + cb_callback: ksocknal_callback, + cb_malloc: ksocknal_malloc, + cb_free: ksocknal_free, + cb_printf: ksocknal_printf, + cb_cli: ksocknal_cli, + cb_sti: ksocknal_sti, + cb_dist: ksocknal_dist +}; diff --git a/lustre/portals/knals/toenal/Makefile.am b/lustre/portals/knals/toenal/Makefile.am new file mode 100644 index 0000000..9bfff64 --- /dev/null +++ b/lustre/portals/knals/toenal/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ktoenal +modulenet_DATA = ktoenal.o +EXTRA_PROGRAMS = ktoenal + +DEFS = +ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h diff --git a/lustre/portals/knals/toenal/toenal.c b/lustre/portals/knals/toenal/toenal.c new file mode 100644 index 0000000..178ea41 --- /dev/null +++ b/lustre/portals/knals/toenal/toenal.c @@ -0,0 +1,629 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include +#include "toenal.h" + +ptl_handle_ni_t ktoenal_ni; +static nal_t ktoenal_api; +static ksock_nal_data_t ktoenal_data; + +/* +ksocknal_interface_t ktoenal_interface = { + ksni_add_sock: ktoenal_add_sock, + ksni_close_sock: ktoenal_close_sock, + ksni_set_mynid: ktoenal_set_mynid, +}; +*/ + +kpr_nal_interface_t ktoenal_router_interface = { + kprni_nalid: TOENAL, + kprni_arg: &ktoenal_data, + kprni_fwd: ktoenal_fwd_packet, +}; + + +int +ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */ + return PTL_OK; +} + +int +ktoenal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ktoenal_close_sock(0); /* close all sockets */ +} + +void +ktoenal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ktoenal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void 
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ktoenal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ktoenal_data.ksnd_mynid); + lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ktoenal_api); +} + +/* + * EXTRA functions follow + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ktoenal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ktoenal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. 
*/ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid); + + ktoenal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +int +ktoenal_add_sock (ptl_nid_t nid, int fd) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + int ret; + ENTRY; + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + file->f_flags |= O_NONBLOCK; /* Does this have any conflicts */ + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ktoenal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags); + + list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist); + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + ktoenal_data_ready(conn); + ktoenal_write_space(conn); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + /* Schedule pollthread so that it will poll + * for newly created socket + */ + + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ktoenal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave 
(&ktoenal_data.ksnd_socklist_lock, flags); + + if (nid == 0) /* close ALL connections */ + { + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ktoenal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ktoenal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + if (list_empty (&death_row)) + return (-ENOENT); + + do { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + ktoenal_put_conn (conn); /* drop ref for ksnd_socklist */ + } while (!list_empty (&death_row)); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + return (0); +} + + +ksock_conn_t * +ktoenal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ktoenal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, 
sizeof (*conn)); + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ktoenal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." */ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) + { + ktoenal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list); + wake_up (&ktoenal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); +} + +void +ktoenal_free_buffers (void) +{ + if (ktoenal_data.ksnd_fmbs != NULL) + { + ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ktoenal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ktoenal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); +} + +int +ktoenal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ktoenal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ktoenal_set_mynid (data->ioc_nid); + break; + } + } + + return rc; +} + + +void __exit +ktoenal_module_fini (void) +{ + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ktoenal_data.ksnd_init) + { + default: + 
LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(TOENAL); + PORTAL_SYMBOL_UNREGISTER (ktoenal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ktoenal_ni); + lib_fini(&ktoenal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ktoenal_data.ksnd_socklist)); + LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns)); + + kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */ + + /* flag threads to terminate; wake and wait for them to die */ + ktoenal_data.ksnd_shuttingdown = 1; + wake_up_all (&ktoenal_data.ksnd_reaper_waitq); + wake_up_all (&ktoenal_data.ksnd_sched_waitq); + /* module init calls this function on early failure, before any thread + * has been spawned; ksnd_pollthread_tsk is then presumably still NULL + * from init's memset (NOTE(review): confirm where it is assigned) */ + if (ktoenal_data.ksnd_pollthread_tsk != NULL) + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0) + { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ktoenal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ktoenal_data.ksnd_router); + + ktoenal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +int __init +ktoenal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ktoenal_api.forward =
ktoenal_api_forward; + ktoenal_api.shutdown = ktoenal_api_shutdown; + ktoenal_api.yield = ktoenal_api_yield; + ktoenal_api.validate = NULL; /* our api validate is a NOOP */ + ktoenal_api.lock = ktoenal_api_lock; + ktoenal_api.unlock = ktoenal_api_unlock; + ktoenal_api.nal_data = &ktoenal_data; + + ktoenal_lib.nal_data = &ktoenal_data; + + memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist); + rwlock_init(&ktoenal_data.ksnd_socklist_lock); + + ktoenal_data.ksnd_nal_cb = &ktoenal_lib; + spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock); + + spin_lock_init (&ktoenal_data.ksnd_sched_lock); + + init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns); + INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list); + init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq); + spin_lock_init (&ktoenal_data.ksnd_reaper_lock); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */ + + PORTAL_ALLOC(ktoenal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + if (ktoenal_data.ksnd_fmbs == NULL) + RETURN(-ENOMEM); + + /* NULL out buffer pointers etc */ + memset(ktoenal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) + { + ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i]; + + if (i < 
SOCKNAL_SMALL_FWD_NMSGS) + { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp; + } + else + { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) + { + fmb->fmb_pages[j] = alloc_page (GFP_KERNEL); + + if (fmb->fmb_pages[j] == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + LASSERT (page_address (fmb->fmb_pages[j]) != NULL); + } + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + + PORTAL_ALLOC(ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + if (ktoenal_data.ksnd_ltxs == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ktoenal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) + { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ktoenal_data.ksnd_idle_ltx_list : + &ktoenal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni); + if (rc != 0) + { + CERROR("ktoenal: PtlNIInit failed: error %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ktoenal_ni, ~0); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */ + + ktoenal_data.ksnd_slistchange = 1; + for (i = 0; i < TOENAL_N_SCHED; i++) + { + rc = ktoenal_thread_start (ktoenal_scheduler, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); + ktoenal_module_fini (); + RETURN (rc); + } + } + + rc = ktoenal_thread_start (ktoenal_reaper, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = ktoenal_thread_start (ktoenal_pollthread, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal pollthread: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ktoenal_data.ksnd_router, + &ktoenal_router_interface); + if (rc != 0) + CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc); + + rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL); + if (rc != 0) + CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n", + rc); + + PORTAL_SYMBOL_REGISTER(ktoenal_ni); + + /* flag everything initialised */ + ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n", + kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ktoenal_module_init); +module_exit(ktoenal_module_fini); + +EXPORT_SYMBOL (ktoenal_ni); diff --git a/lustre/portals/knals/toenal/toenal.h b/lustre/portals/knals/toenal/toenal.h new file mode 100644 index 0000000..f793d3b --- /dev/null +++ b/lustre/portals/knals/toenal/toenal.h @@ -0,0 +1,236 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 32 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +#define TOENAL_N_SCHED 1 + +typedef struct /* pool of forwarding buffers */ +{ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + + kpr_router_t ksnd_router; /* THE router */ + + spinlock_t ksnd_sched_lock; /* serialise packet scheduling */ + wait_queue_head_t ksnd_sched_waitq; /* where scheduler(s) wait */ + + struct list_head ksnd_rx_conns; /* conn waiting to 
be read */ + struct list_head ksnd_tx_conns; /* conn waiting to be written */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */ + poll_table ksnd_pwait; /* poll wait table for the socket */ + int ksnd_slistchange; /* informs the pollthread that + * the socklist has changed */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet frags */ + struct iovec *tx_iov; /* packet frags */ +} ksock_tx_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + struct iovec ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define 
KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as a convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct /* state of one socket connection to a peer */ +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* socket */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # frags */ + struct iovec ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */ + + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /*
packets waiting to be sent */ + unsigned long ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int ktoenal_add_sock (ptl_nid_t nid, int fd); +extern int ktoenal_close_sock(ptl_nid_t nid); +extern int ktoenal_set_mynid(ptl_nid_t nid); +extern int ktoenal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid); +extern void _ktoenal_put_conn (ksock_conn_t *conn); +extern void ktoenal_close_conn (ksock_conn_t *conn); + +static inline void +ktoenal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ktoenal_put_conn (conn); +} + +extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg); +extern int ktoenal_new_packet (ksock_conn_t *conn, int skip); +extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ktoenal_scheduler (void *arg); +extern int ktoenal_reaper (void *arg); +extern int ktoenal_pollthread (void *arg); +extern void ktoenal_data_ready(ksock_conn_t *conn); +extern void ktoenal_write_space(ksock_conn_t *conn); + + +extern nal_cb_t ktoenal_lib; +extern ksock_nal_data_t ktoenal_data; diff --git a/lustre/portals/knals/toenal/toenal_cb.c b/lustre/portals/knals/toenal/toenal_cb.c new file mode 100644 index 0000000..8270196 --- /dev/null +++ b/lustre/portals/knals/toenal/toenal_cb.c @@ -0,0 +1,1220 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include "toenal.h" + +atomic_t ktoenal_packets_received; +long ktoenal_packets_launched; +long ktoenal_packets_transmitted; + +/* + * LIB functions follow + * + */ +int +ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ktoenal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ktoenal_free(nal_cb_t *nal, void *buf, size_t len) +{ + 
PORTAL_FREE(buf, len); +} + +void +ktoenal_printf(nal_cb_t *nal, const char *fmt, ...) +{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ktoenal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ktoenal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ktoenal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ktoenal_get_ltx (int may_block) +{ + long flags; + ksock_ltx_t *ltx = NULL; + + for (;;) + { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); + list_del (<x->ltx_tx.tx_list); + break; + } + + if (!may_block) + { + if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (<x->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + wait_event (ktoenal_data.ksnd_idle_ltx_waitq, + !list_empty (&ktoenal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + return (ltx); +} + +int +ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags) +{ + /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't) + */ + mm_segment_t oldmm; + int rc; + + LASSERT (niov > 0); + LASSERT (nob > 
0); + + oldmm = get_fs(); + set_fs (KERNEL_DS); + +#ifdef PORTAL_DEBUG + { + int total_nob; + int i; + + for (i = total_nob = 0; i < niov; i++) + total_nob += iov[i].iov_len; + + LASSERT (nob == total_nob); + } +#endif + LASSERT (!in_interrupt()); + + rc = sock->f_op->writev(sock, iov, niov, NULL); + + set_fs (oldmm); + + if (rc > 0) /* sent something? */ + { + nob = rc; /* consume iov */ + for (;;) + { + LASSERT (niov > 0); + + if (iov->iov_len >= nob) + { + iov->iov_len -= nob; + iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); + break; + } + nob -= iov->iov_len; + iov->iov_len = 0; + iov++; + niov--; + } + } + + return (rc); +} + +int +ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread) +{ + /* NB This procedure "consumes" iov (actually tcp_recvmsg does) + */ + mm_segment_t oldmm; + int ret, i, len = 0, origlen = 0; + + PROF_START(our_recvmsg); + for(i = 0; i < niov; i++) { + len += iov[i].iov_len; + if(len >= toread) + break; + } + + if(len >= toread) { + origlen = iov[i].iov_len; + iov[i].iov_len -= (len - toread); + } + else { /* i == niov */ + i = niov - 1; + } + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + ret = sock->f_op->readv(sock, iov, i + 1, NULL); + + set_fs(oldmm); + + if(origlen) + iov[i].iov_len = origlen; + + PROF_FINISH(our_recvmsg); + return ret; +} + +void +ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags) +{ + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + int rc; + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + + /* assume transmit will complete now, so dequeue while I've got the lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0; /* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ktoenal_sendmsg (conn->ksnc_file, + 
tx->tx_iov, tx->tx_niov, tx->tx_nob, + list_empty (&conn->ksnc_tx_queue) ? + MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE)); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc < 0) /* error */ + { + if (rc == -EAGAIN) /* socket full => */ + rc = 0; /* nothing sent */ + else + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + rc = tx->tx_nob; /* kid on for now whole packet went */ + } + } + + if (rc == tx->tx_nob) /* everything went */ + { + conn->ksnc_tx_ready = 1; /* assume more can go (ASAP) */ + ktoenal_put_conn (conn); /* release packet's ref */ + + if (tx->tx_isfwd) /* was a forwarded packet? */ + { + kpr_fwd_done (&ktoenal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + } + else /* local send */ + { + ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list && + waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq)) + wake_up (&ktoenal_data.ksnd_idle_ltx_waitq); + } + ktoenal_packets_transmitted++; + } + else + { + tx->tx_nob -= rc; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) /* nothing to write */ + { + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); +} + +void +ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + long flags; + int nob 
= tx->tx_nob; + struct iovec *iov = tx->tx_iov; + int niov = 1; + + LASSERT (nob >= sizeof (ptl_hdr_t)); + + /* Truncate iov to exactly match total packet length + * since socket sendmsg pays no attention to requested length. + */ + for (;;) + { + LASSERT (niov <= tx->tx_niov); + LASSERT (iov->iov_len >= 0); + + if (iov->iov_len >= nob) + { + iov->iov_len = nob; + break; + } + nob -= iov->iov_len; + iov++; + niov++; + } + tx->tx_niov = niov; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) /* not scheduled to send */ + { + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + ktoenal_packets_launched++; + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +int +ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + ksock_ltx_t *ltx; + int rc; + int i; + + /* By this point, as it happens, we have absolutely no idea what + * 'private' is. It might be ksock_nal_data or it might be ksock_conn. + * Ha ha, isn't that a funny joke? + * + * FIXME: this is not the right way to fix this; the right way is to + * always pass in the same kind of structure. This is hard right now. + * To revisit this issue, set a breakpoint in here and watch for when + * it's called from lib_finalize. I think this occurs when we send a + * packet as a side-effect of another packet, such as when an ACK has + * been requested. -phil */ + + CDEBUG(D_NET, "sending "LPSZ" bytes from [%d](%p,%d)... 
to nid: "LPX64" pid %d\n", + payload_len, payload_niov, + payload_niov > 0 ? payload_iov[0].iov_base : NULL, + payload_niov > 0 ? payload_iov[0].iov_len : 0, + nid, pid); + + if ((conn = ktoenal_get_conn (nid)) == NULL) + { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) + { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (-1); + } + + if ((conn = ktoenal_get_conn (gatewaynid)) == NULL) + { + CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", + nid, gatewaynid); + return (-1); + } + } + + /* This transmit has now got a ref on conn */ + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) + { + CERROR ("Can't allocate tx desc\n"); + ktoenal_put_conn (conn); + return (-1); + } + + /* Init common (to sends and forwards) packet part */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_iov = ltx->ltx_iov; + + /* Init local send packet (storage for hdr, finalize() args, iov) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + ltx->ltx_iov[0].iov_base = <x->ltx_hdr; + ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + for (i = 0; i < payload_niov; i++) + { + ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base; + ltx->ltx_iov[1 + i].iov_len = payload_iov[i].iov_len; + } + + ktoenal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +void +ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + fwd->kprfd_gateway_nid, 
fwd->kprfd_target_nid); + + if (nid == ktoenal_lib.ni.nid) /* I'm the gateway; must be the last hop */ + nid = fwd->kprfd_target_nid; + + conn = ktoenal_get_conn (nid); + if (conn == NULL) + { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + + ktoenal_launch_packet (conn, tx); +} + +int +ktoenal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ktoenal_data.ksnd_nthreads); + return (0); +} + +void +ktoenal_thread_fini (void) +{ + atomic_dec (&ktoenal_data.ksnd_nthreads); +} + +void +ktoenal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn; + long flags; + + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + if (error != 0) + CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + + if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns)) + { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore 
(&ktoenal_data.ksnd_sched_lock, flags); +} + +/* Choose the small or large forwarding-buffer pool from the packet size + * and return an idle buffer from it. If the pool has none idle, queue + * conn on the pool's blocked list in state SOCKNAL_RX_FMB_SLEEP and + * return NULL. + */ +ksock_fmb_t * +ktoenal_get_idle_fmb (ksock_conn_t *conn) +{ + /* NB called with sched lock held */ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ktoenal_data.ksnd_small_fmp; + else + pool = &ktoenal_data.ksnd_large_fmp; + + if (!list_empty (&pool->fmp_idle_fmbs)) + { + fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); + list_del (&fmb->fmb_list); + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + return (NULL); +} + + +int +ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload, start reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload.
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */ + + if (payload_nob == 0) /* got complete packet already */ + { + atomic_inc (&ktoenal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, 1, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + else + { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do + { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, niov, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", 
conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob); + return (0); +} + +void +ktoenal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + int body_len; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + switch (conn->ksnc_hdr.type) + { + case PTL_MSG_GET: + case PTL_MSG_ACK: + body_len = 0; + break; + case PTL_MSG_PUT: + body_len = conn->ksnc_hdr.msg.put.length; + break; + case PTL_MSG_REPLY: + body_len = conn->ksnc_hdr.msg.reply.length; + break; + default: + /* Unrecognised packet type */ + CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n", + conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + /* Ignore this header and go back to reading a new packet. */ + ktoenal_new_packet (conn, 0); + return; + } + + if (body_len < 0) /* length corrupt */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) /* too big to forward */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, body_len); /* on to new packet (skip this one's body) */ + return; + } + + conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */ + if (conn2 != NULL) + { + CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + ktoenal_put_conn (conn2); /* drop ref from get above */ + + ktoenal_new_packet (conn, body_len); /* on to next packet (skip this one's body) */ + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ 
+ conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +/* Prepare conn's receive state for what comes next on the socket. + * With nob_to_skip == 0, set up to read a fresh header into ksnc_hdr + * and return 1. Otherwise point the rx iov entries at a static scratch + * buffer so the next read discards up to that many bytes, and return 0. + */ +int +ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ktoenal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) /* right at next packet boundary now */ + { + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + return (1); + } + + /* set up to skip as much as possible now */ + /* if there's more left (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + skipped = 0; + niov = 0; + + /* every iov entry reuses the same scratch buffer: the data is thrown away */ + do + { + nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags) +{ + ksock_fmb_t *fmb; + int len; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* NB: sched lock held */ + CDEBUG(D_NET, "conn %p\n", conn); + + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) /* doesn't need a forwarding buffer */ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto try_read; + } + + get_fmb: + /* NB: sched lock held */ + fmb = ktoenal_get_idle_fmb (conn); + if (fmb == NULL) /* conn descheduled waiting for idle fmb */ + return; + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock,
*irq_flags); + + if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0; /* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + /* NB ktoenal_recvmsg "consumes" the iov passed to it */ + len = ktoenal_recvmsg(conn->ksnc_file, + conn->ksnc_rx_iov, conn->ksnc_rx_niov, + conn->ksnc_rx_nob_wanted); + CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len); + + if (len <= 0) /* nothing ready (EAGAIN) or EOF or error */ + { + if (len != -EAGAIN && /* ! nothing to read now */ + len != 0) /* ! nothing to read ever */ + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read(%d) %p: %d\n", + conn->ksnc_rx_nob_wanted, conn, len); + } + goto out; /* come back when there's data ready */ + } + + LASSERT (len <= conn->ksnc_rx_nob_wanted); + conn->ksnc_rx_nob_wanted -= len; + conn->ksnc_rx_nob_left -= len; + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + conn->ksnc_rx_ready = 1; /* assume there's more to be had */ + + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: + if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */ + { + ktoenal_fwd_parse (conn); + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping this packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto get_fmb; /* => go get a fwd msg buffer */ 
+ default: + break; /* a label needs a statement; falls to the LBUG below */ + } + /* Not Reached */ + LBUG (); + } + + PROF_START(lib_parse); + lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */ + PROF_FINISH(lib_parse); + + if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */ + { + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ktoenal_packets_received); + lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */ + /* Fall through */ + + case SOCKNAL_RX_SLOP: + if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */ + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + atomic_inc (&ktoenal_packets_received); + + /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */ + kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie); + + LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + break; /* a label needs a statement; falls to the LBUG below */ + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + if (!conn->ksnc_rx_ready) /* no data there to read?
*/ + { + conn->ksnc_rx_scheduled = 0; /* let socket callback schedule again */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); +} + +int +ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + int i; + + conn->ksnc_cookie = msg; + + LASSERT (niov <= PTL_MD_MAX_IOV); + for (i = 0; i < niov; i++) + { + conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len; + conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base; + } + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + return (rlen); +} + +int +ktoenal_scheduler (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + int nloops = 0; + + kportal_daemonize ("ktoenal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + while (!ktoenal_data.ksnd_shuttingdown) + { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&ktoenal_data.ksnd_rx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */ + } + + if (!list_empty (&ktoenal_data.ksnd_tx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + + list_del (&conn->ksnc_tx_list); + ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */ + } + + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) /* hogging CPU? 
*/ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty (&ktoenal_data.ksnd_rx_conns) || + !list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + ktoenal_thread_fini (); + return (0); +} + + +int +ktoenal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ktoenal_reaper"); + kportal_blockallsigs (); + + while (!ktoenal_data.ksnd_shuttingdown) + { + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ktoenal_data.ksnd_reaper_list)) + conn = NULL; + else + { + conn = list_entry (ktoenal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ktoenal_close_conn (conn); + else { + rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty(&ktoenal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ktoenal_thread_fini (); + return (0); +} + +#define POLLREAD (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI) +#define POLLWRITE (POLLOUT | POLLWRNORM | POLLWRBAND) + +int +ktoenal_pollthread(void *arg) +{ + unsigned int mask; + struct list_head *tmp; + ksock_conn_t *conn; + + /* Save the task struct for waking it up */ + ktoenal_data.ksnd_pollthread_tsk = current; + + kportal_daemonize ("ktoenal_pollthread"); + kportal_blockallsigs (); + + poll_initwait(&ktoenal_data.ksnd_pwait); + + while(!ktoenal_data.ksnd_shuttingdown) { + + set_current_state(TASK_INTERRUPTIBLE); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + list_for_each(tmp, 
&ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc(&conn->ksnc_refcount); + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + mask = conn->ksnc_file->f_op->poll(conn->ksnc_file, + ktoenal_data.ksnd_slistchange ? + &ktoenal_data.ksnd_pwait : NULL); + + if(mask & POLLREAD) { + ktoenal_data_ready(conn); + + } + if (mask & POLLWRITE) { + ktoenal_write_space(conn); + + } + if (mask & (POLLERR | POLLHUP)) { + /* Do error processing */ + } + + read_lock (&ktoenal_data.ksnd_socklist_lock); + if(atomic_dec_and_test(&conn->ksnc_refcount)) + _ktoenal_put_conn(conn); + } + ktoenal_data.ksnd_slistchange = 0; + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + if(ktoenal_data.ksnd_slistchange) { + poll_freewait(&ktoenal_data.ksnd_pwait); + poll_initwait(&ktoenal_data.ksnd_pwait); + } + } + poll_freewait(&ktoenal_data.ksnd_pwait); + ktoenal_thread_fini(); + return (0); +} + +void +ktoenal_data_ready (ksock_conn_t *conn) +{ + unsigned long flags; + ENTRY; + + if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail (&conn->ksnc_rx_list, + &ktoenal_data.ksnd_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + /* This is done to avoid the effects of a sequence + * of events in which the rx_ready is lost + */ + conn->ksnc_rx_ready=1; + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } + + EXIT; +} + +void +ktoenal_write_space (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "conn %p%s%s%s\n", + conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? 
" scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued")); + + + if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */ + !conn->ksnc_tx_scheduled) { /* not being progressed */ + + list_add_tail (&conn->ksnc_tx_list, + &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } +} + +nal_cb_t ktoenal_lib = { + nal_data: &ktoenal_data, /* NAL private data */ + cb_send: ktoenal_send, + cb_recv: ktoenal_recv, + cb_read: ktoenal_read, + cb_write: ktoenal_write, + cb_callback: ktoenal_callback, + cb_malloc: ktoenal_malloc, + cb_free: ktoenal_free, + cb_printf: ktoenal_printf, + cb_cli: ktoenal_cli, + cb_sti: ktoenal_sti, + cb_dist: ktoenal_dist +}; diff --git a/lustre/portals/libcfs/Makefile.am b/lustre/portals/libcfs/Makefile.am new file mode 100644 index 0000000..e2e11af --- /dev/null +++ b/lustre/portals/libcfs/Makefile.am @@ -0,0 +1,29 @@ +# Copyright (C) 2001, 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +MODULE = portals +modulenet_DATA = portals.o +EXTRA_PROGRAMS = portals + +LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-not-impl.c lib-pid.c +APILINKS := api-eq.c api-errno.c api-init.c api-md.c api-me.c api-ni.c api-wrap.c +LINKS = $(APILINKS) $(LIBLINKS) +DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej + +$(LINKS): link-stamp +link-stamp: + -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + echo timestamp > link-stamp + +DEFS = +portals_SOURCES = $(LINKS) module.c proc.c debug.c + +# Don't distribute any patched files. +dist-hook: + list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done + +include ../Rules.linux diff --git a/lustre/portals/libcfs/Makefile.mk b/lustre/portals/libcfs/Makefile.mk new file mode 100644 index 0000000..3196ea2 --- /dev/null +++ b/lustre/portals/libcfs/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include fs/lustre/portals/Kernelenv + +obj-y += libcfs.o +libcfs-objs := module.o proc.o debug.o \ No newline at end of file diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c new file mode 100644 index 0000000..6233b8d --- /dev/null +++ b/lustre/portals/libcfs/debug.c @@ -0,0 +1,821 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. 
+ * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include + +#define DEBUG_OVERFLOW 1024 +static char *debug_buf = NULL; +static unsigned long debug_size = 0; +static atomic_t debug_off_a = ATOMIC_INIT(0); +static int debug_wrapped; +wait_queue_head_t debug_ctlwq; +#define DAEMON_SND_SIZE (64 << 10) + +/* + * used by the daemon to keep track the offset into debug_buffer for the next + * write to the file. 
Usually, the daemon writes out the buffer + * from debug_daemon_next_write up to debug_off + * variable usage + * Reader - portals_debug_msg() + * Writer - portals_debug_daemon() + * portals_debug_daemon_start() during daemon init time + * portals_debug_daemon_continue() to reset to debug_off + * portals_debug_clear_buffer() reset to debug_off for clear + * Note that *_start(), *_continue() & *clear_buffer() should be serialized. + */ +static atomic_t debug_daemon_next_write; + +/* + * A debug_daemon can be in the following states + * stopped - stopped state means there is no debug_daemon running. + * accordingly, it must be in paused state + * a daemon is in !stopped && !paused state after + * "lctl debug_daemon start" creates debug_daemon successfully + * Variable Usage + * Reader - portals_debug_daemon() + * portals_debug_set_daemon() routines + * Writer - portals_debug_set_daemon() routines + * portals_debug_daemon() on IO error + * paused - a debug_daemon state is changed from !paused into paused + * when "lctl debug_daemon pause" is issued + * "lctl debug_daemon continue" gets a daemon into !paused mode + * Reader - portals_debug_set_daemon() routines + * portals_debug_msg() + * Writer - portals_debug_set_daemon() on init + * portals_debug_daemon() + * + * Daemon state diagram. + * (stopped, paused) + * | <-- debug_daemon start + * V + * (!stopped, !paused) + * | <-- debug_daemon pause + * V + * (!stopped, paused) + * | <-- debug_daemon continue + * V + * (!stopped, !paused) + * | <-- debug_daemon stop + * V + * (stopped, paused) + * Overlapped - this is the state when CDEBUG is too fast for the daemon to + * write out the debug_buffer. That is, debug_off is about to + * overlap debug_daemon_next_write. + * Reader - portals_debug_msg() + * Writer - portals_debug_msg() + */ + +/* + * Description of Trace Daemon Synchronization + * + * Three categories of code synchronize with each other + * 1. 
lctl, portals_debug_set_daemon(), the user debug control code, + * as well as portals_debug_clear_buffer() + * 2. CDEBUG, portals_debug_msg(), the routine that puts debug messages + * 3. Daemon, portals_debug_daemon(), to write out the debug log file + * + * + * Three different controls are used for synchronization + * + * 1. debug_daemon_semaphore + * This semaphore serializes multiple lctl controls + * manipulating the debug daemon state. The semaphore serves as the + * gatekeeper to allow only one user control thread, at any given time, + * to access the debug daemon state and keeps the other user control requests + * in a wait state until the current control request is serviced. + * + * 2. wait_queue_head_t lctl (paired with lctl_event flag) + * Lctl event is the event between portals_debug_set_daemon() and + * portals_debug_daemon(). Lctl is an indicator for portals_debug_daemon() + * to flush data out to file. portals_debug_daemon() uses the lctl event + * as a signal channel to wake up portals_debug_set_daemon() once the flush + * operation is done. + * + * Producer : + * portals_debug_daemon() uses it to wake up the + * portals_debug_set_daemon() pause and stop routines + * Consumer : + * portals_debug_set_daemon(), stop and pause operations, + * wait and sleep on the event + * + * 3. wait_queue_head_t daemon (paired with daemon_event flag) + * This is an event channel to wake up portals_debug_daemon(). Daemon + * wakes up to run whenever there is an event posted. Daemon handles + * 2 types of operations: 1. Writes data out to debug file, 2. Flushes + * file and terminates based on lctl event. + * File operation - + * Daemon is normally in a sleep state. + * Daemon is woken up through daemon event whenever CDEBUG is + * putting data over any 64K boundary. + * File flush and termination - + * On portals_debug_daemon_stop/pause() operations, lctl control + * is to wake up daemon through daemon event. 
+ * + * We can't use sleep_on() and wake_up() to replace daemon event because + * portals_debug_daemon() must catch the wakeup operation posted by + * portals_debug_daemon_stop/pause(). Otherwise, stop and pause may get + * stuck in lctl wait event. + * + * Producer : + * a. portals_debug_daemon_pause() and portals_debug_daemon_stop() + * use the event to wake up portals_debug_daemon() + * b. portals_debug_msg() uses the event to wake up + * portals_debug_daemon() whenever the data output crosses + * a 64K-byte boundary. + * Consumer : + * portals_debug_daemon() wakes up upon daemon event. + * + * Sequence for portals_debug_daemon_stop() operation + * + * _Portals_debug_daemon_stop()_ _Daemon_ + * Wait_event(daemon) or running + * Paused = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Set force_flush flag if lctl event + * Flush data + * Wakeup_event (lctl) + * Wait_event(daemon) + * Stopped = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Exit daemon loop if (Stopped) + * Wakeup_event (lctl) + * Exit + * Return to user application + * + * + * _Portals_debug_msg()_ _Daemon_ + * Wait_event(daemon) or running + * If (WriteStart<64Kjournal_info; + current->journal_info = NULL; + sprintf(debug_file_name, "%s.%ld", debug_file_path, CURRENT_TIME); + file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for dumping", debug_file_name); + GOTO(out, PTR_ERR(file)); + } else { + printk(KERN_ALERT "dumping log to %s ... 
writing ...\n", + debug_file_name); + } + + debug_off = atomic_read(&debug_off_a); + oldfs = get_fs(); + set_fs(get_ds()); + if (debug_wrapped) { + rc = file->f_op->write(file, debug_buf + debug_off + 1, + debug_size-debug_off-1, &file->f_pos); + rc += file->f_op->write(file, debug_buf, debug_off + 1, + &file->f_pos); + } else { + rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos); + } + printk("wrote %d bytes\n", rc); + set_fs(oldfs); + + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc) + CERROR("sync returns %d\n", rc); + filp_close(file, 0); +out: + current->journal_info = journal_info; + wake_up(&debug_ctlwq); + return 0; +} + +int portals_debug_daemon(void *arg) +{ + struct file *file; + void *journal_info; + mm_segment_t oldfs; + unsigned long force_flush = 0; + unsigned long size; + int rc; + + kportal_daemonize("ldebug_daemon"); + reparent_to_init(); + journal_info = current->journal_info; + current->journal_info = NULL; + + file = filp_open(debug_daemon_file_path, + O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for logging", debug_daemon_file_path); + GOTO(out1, PTR_ERR(file)); + } else { + printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n", + debug_daemon_file_path); + } + + debug_daemon_state.overlapped = 0; + debug_daemon_state.stopped = 0; + atomic_set(&debug_daemon_state.paused, 0); + oldfs = get_fs(); + set_fs(KERNEL_DS); + while (1) { + unsigned long ending; + unsigned long start, tail; + long delta; + + debug_daemon_state.daemon_event = 0; + + ending = atomic_read(&debug_off_a); + start = atomic_read(&debug_daemon_next_write); + + /* check if paused is imposed by lctl ? */ + force_flush = !debug_daemon_state.lctl_event; + + delta = ending - start; + tail = debug_size - start; + size = (delta >= 0) ? 
delta : tail; + while (size && (force_flush || (delta < 0) || + (size >= DAEMON_SND_SIZE))) { + if (daemon_file_size_limit) { + int ssize = daemon_file_size_limit - file->f_pos; + if (size > ssize) + size = ssize; + } + + rc = file->f_op->write(file, debug_buf+start, + size, &file->f_pos); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon write error %d\n", rc); + goto out; + } + start += rc; + delta = ending - start; + tail = debug_size - start; + if (tail == 0) + start = 0; + if (delta >= 0) + size = delta; + else + size = (tail == 0) ? ending : tail; + if (daemon_file_size_limit == file->f_pos) { + // file wrapped around + file->f_pos = 0; + } + } + atomic_set(&debug_daemon_next_write, start); + if (force_flush) { + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon sync error %d\n", rc); + goto out; + } + if (debug_daemon_state.stopped) + break; + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + } + wait_event(debug_daemon_state.daemon, + debug_daemon_state.daemon_event); + } +out: + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + set_fs(oldfs); + filp_close(file, 0); + current->journal_info = journal_info; +out1: + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + return 0; +} + +void portals_debug_print(void) +{ + unsigned long dumplen = 64 * 1024; + char *start1, *start2; + char *end1, *end2; + unsigned long debug_off = atomic_read(&debug_off_a); + + start1 = debug_buf + debug_off - dumplen; + if (start1 < debug_buf) { + start1 += debug_size; + end1 = debug_buf + debug_size - 1; + start2 = debug_buf; + end2 = debug_buf + debug_off; + } else { + end1 = debug_buf + debug_off; + start2 = debug_buf + debug_off; + end2 = debug_buf + debug_off; + } + + while (start1 < end1) { + int count = MIN(1024, end1 - start1); + printk("%*s", count, start1); + start1 += 1024; + } + while (start2 < end2) { + int count = MIN(1024, end2 - start2); + 
printk("%*s", count, start2); + start2 += 1024; + } +} + +void portals_debug_dumplog(void) +{ + int rc; + ENTRY; + + init_waitqueue_head(&debug_ctlwq); + + rc = kernel_thread(portals_do_debug_dumplog, + NULL, CLONE_VM | CLONE_FS | CLONE_FILES); + if (rc < 0) { + printk(KERN_ERR "cannot start dump thread\n"); + return; + } + sleep_on(&debug_ctlwq); +} + +int portals_debug_daemon_start(char *file, unsigned int size) +{ + int rc; + + if (!debug_daemon_state.stopped) + return -EALREADY; + + if (file != NULL) + strncpy(debug_daemon_file_path, file, 1024); + + init_waitqueue_head(&debug_daemon_state.lctl); + init_waitqueue_head(&debug_daemon_state.daemon); + + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + + daemon_file_size_limit = size << 20; + + debug_daemon_state.lctl_event = 0; + rc = kernel_thread(portals_debug_daemon, NULL, 0); + if (rc < 0) { + printk(KERN_ERR "cannot start debug daemon thread\n"); + strncpy(debug_daemon_file_path, "\0", 1); + return rc; + } + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_pause(void) +{ + if (atomic_read(&debug_daemon_state.paused)) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.lctl_event = 0; + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_continue(void) +{ + if (!atomic_read(&debug_daemon_state.paused)) + return -EINVAL; + if (debug_daemon_state.stopped) + return -EINVAL; + + debug_daemon_state.overlapped = 0; + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + atomic_set(&debug_daemon_state.paused, 0); + return 0; +} + +int portals_debug_daemon_stop(void) +{ + if (debug_daemon_state.stopped) + return -EALREADY; + + if (!atomic_read(&debug_daemon_state.paused)) + portals_debug_daemon_pause(); + + debug_daemon_state.lctl_event = 0; + 
debug_daemon_state.stopped = 1; + + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + + debug_daemon_file_path[0] = '\0'; + return 0; +} + +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *filename, unsigned int size) +{ + int rc = -EINVAL; + + down(&debug_daemon_semaphore); + switch (cmd) { + case DEBUG_DAEMON_START: + if (length && (filename[length -1] != '\0')) { + CERROR("Invalid filename for debug_daemon\n"); + rc = -EINVAL; + break; + } + rc = portals_debug_daemon_start(filename, size); + break; + case DEBUG_DAEMON_STOP: + rc = portals_debug_daemon_stop(); + break; + case DEBUG_DAEMON_PAUSE: + rc = portals_debug_daemon_pause(); + break; + case DEBUG_DAEMON_CONTINUE: + rc = portals_debug_daemon_continue(); + break; + default: + CERROR("unknown set_daemon cmd\n"); + } + up(&debug_daemon_semaphore); + return rc; +} + +static int panic_dumplog(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (handled_panic) + return 0; + else + handled_panic = 1; + + if (in_interrupt()) { + portals_debug_print(); + return 0; + } + + while (current->lock_depth >= 0) + unlock_kernel(); + portals_debug_dumplog(); + return 0; +} + +static struct notifier_block lustre_panic_notifier = { + notifier_call : panic_dumplog, + next : NULL, + priority : 10000 +}; + +int portals_debug_init(unsigned long bufsize) +{ + unsigned long debug_off = atomic_read(&debug_off_a); + if (debug_buf != NULL) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + + debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW); + if (debug_buf == NULL) + return -ENOMEM; + memset(debug_buf, 0, debug_size); + debug_wrapped = 0; + + printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n", + bufsize, debug_buf); + atomic_set(&debug_off_a, debug_off); + notifier_chain_register(&panic_notifier_list, 
&lustre_panic_notifier); + debug_size = bufsize; + + return 0; +} + +int portals_debug_cleanup(void) +{ + notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + portals_debug_daemon_stop(); + + vfree(debug_buf); + atomic_set(&debug_off_a, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +int portals_debug_clear_buffer(void) +{ + unsigned long flags; + unsigned long state; + + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + state = atomic_read(&debug_daemon_state.paused); + if (!state) + portals_debug_daemon_pause(); + spin_lock_irqsave(&portals_debug_lock, flags); + atomic_set(&debug_off_a, 0); + debug_wrapped = 0; + atomic_set(&debug_daemon_next_write, 0); + debug_daemon_state.overlapped = 0; + spin_unlock_irqrestore(&portals_debug_lock, flags); + + if (!state) + atomic_set(&debug_daemon_state.paused, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +/* Debug markers, although printed by S_PORTALS + * should not be be marked as such. 
+ */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int portals_debug_mark_buffer(char *text) +{ + if (debug_buf == NULL) + return -EINVAL; + + CDEBUG(0, "*******************************************************************************\n"); + CDEBUG(0, "DEBUG MARKER: %s\n", text); + CDEBUG(0, "*******************************************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_PORTALS + +__s32 portals_debug_copy_to_user(char *buf, unsigned long len) +{ + int rc; + unsigned long debug_off; + unsigned long flags; + + if (len < debug_size) + return -ENOSPC; + + debug_off = atomic_read(&debug_off_a); + spin_lock_irqsave(&portals_debug_lock, flags); + if (debug_wrapped) { + /* All of this juggling with the 1s is to keep the trailing nul + * (which falls at debug_buf + debug_off) at the end of what we + * copy into user space */ + copy_to_user(buf, debug_buf + debug_off + 1, + debug_size - debug_off - 1); + copy_to_user(buf + debug_size - debug_off - 1, + debug_buf, debug_off + 1); + rc = debug_size; + } else { + copy_to_user(buf, debug_buf, debug_off); + rc = debug_off; + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + + return rc; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) +{ + va_list ap; + unsigned long flags; + int max_nob; + int prefix_nob; + int msg_nob; + struct timeval tv; + unsigned long base_offset; + unsigned long debug_off; + + if (debug_buf == NULL) { + printk("portals_debug_msg: debug_buf is NULL!\n"); + return; + } + + spin_lock_irqsave(&portals_debug_lock, flags); + debug_off = atomic_read(&debug_off_a); + if (!atomic_read(&debug_daemon_state.paused)) { + unsigned long available; + long delta; + long v = atomic_read(&debug_daemon_next_write); + + delta = debug_off - v; + available = (delta>=0) ? 
debug_size-delta : -delta; + // Check if we still have enough debug buffer for CDEBUG + if (available < DAEMON_SND_SIZE) { + /* Drop CDEBUG packets until enough debug_buffer is + * available */ + if (debug_daemon_state.overlapped) + goto out; + /* If this is the first time, leave a marker in the + * output */ + debug_daemon_state.overlapped = 1; + ap = NULL; + format = "DEBUG MARKER: Debug buffer overlapped\n"; + } else /* More space just became available */ + debug_daemon_state.overlapped = 0; + } + + max_nob = debug_size - debug_off + DEBUG_OVERFLOW; + if (max_nob <= 0) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + printk("logic error in portals_debug_msg: <0 bytes to write\n"); + return; + } + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. + */ + + do_gettimeofday(&tv); + + prefix_nob = snprintf(debug_buf + debug_off, max_nob, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id(), + tv.tv_sec, tv.tv_usec); + max_nob -= prefix_nob; + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.extern_pid, stack); +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.mode.tt.extern_pid, stack); +#else + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d+%lu): ", + file, line, fn, current->pid, stack); +#endif + max_nob -= msg_nob; + + va_start(ap, format); + msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); + + /* Print to console, while msg is 
contiguous in debug_buf */ + /* NB safely terminated see above */ + if ((mask & D_EMERG) != 0) + printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob); + if ((mask & D_ERROR) != 0) + printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob); + else if (portal_printk) + printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob); + base_offset = debug_off & 0xFFFF; + + debug_off += prefix_nob + msg_nob; + if (debug_off > debug_size) { + memcpy(debug_buf, debug_buf + debug_size, + debug_off - debug_size + 1); + debug_off -= debug_size; + debug_wrapped = 1; + } + + atomic_set(&debug_off_a, debug_off); + if (!atomic_read(&debug_daemon_state.paused) && + ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) { + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + } +out: + spin_unlock_irqrestore(&portals_debug_lock, flags); +} + +void portals_debug_set_level(unsigned int debug_level) +{ + printk("Setting portals debug level to %08x\n", debug_level); + portal_debug = debug_level; +} + +void portals_run_lbug_upcall(char * file, char *fn, int line) +{ + char *argv[6]; + char *envp[3]; + char buf[32]; + int rc; + + ENTRY; + snprintf (buf, sizeof buf, "%d", line); + + argv[0] = portals_upcall; + argv[1] = "LBUG"; + argv[2] = file; + argv[3] = fn; + argv[4] = buf; + argv[5] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " + "/proc/sys/portals/upcall\n", + argv[0], argv[1], argv[2], argv[3], argv[4], rc); + + } else { + CERROR("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); + } +} + + +EXPORT_SYMBOL(portals_debug_dumplog); +EXPORT_SYMBOL(portals_debug_msg); +EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c new file mode 
100644 index 0000000..1b9e5bb --- /dev/null +++ b/lustre/portals/libcfs/module.c @@ -0,0 +1,572 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define PORTAL_MINOR 240 + +extern void (kping_client)(struct portal_ioctl_data *); + +struct nal_cmd_handler { + nal_cmd_handler_t nch_handler; + void * nch_private; +}; + +static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +struct semaphore nal_cmd_sem; + +#ifdef PORTAL_DEBUG +void +kportal_assertion_failed (char *expr, char *file, char *func, int line) +{ + unsigned long stack = CDEBUG_STACK(stack); + portals_debug_msg(0, D_EMERG, file, func, line, stack, + "ASSERTION(%s) failed\n", expr); + LBUG(); +} +#endif + +void +kportal_daemonize (char *str) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) + daemonize(str); +#else + daemonize(); + snprintf (current->comm, sizeof (current->comm), "%s", str); +#endif +} + +void +kportal_blockallsigs () +{ + unsigned long flags; + + spin_lock_irqsave (&current->sigmask_lock, flags); + siginitsetinv (&current->blocked, 0); + recalc_sigpending (current); + spin_unlock_irqrestore (&current->sigmask_lock, flags); +} + +/* called when opening /dev/device */ +static int kportal_psdev_open(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + PORTAL_MODULE_USE; + RETURN(0); +} + +/* called when closing /dev/device */ +static int kportal_psdev_release(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + + PORTAL_MODULE_UNUSE; + RETURN(0); +} + +static inline void freedata(void *data, int len) +{ + PORTAL_FREE(data, len); +} + +static int +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + 
+ rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_del_route(ptl_nid_t target) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_del_route (target); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) +{ + int gateway_nalid; + ptl_nid_t gateway_nid; + ptl_nid_t lo_nid; + ptl_nid_t hi_nid; + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, + &hi_nid); + + if (rc == 0) { + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", + index, gateway_nalid, gateway_nid, lo_nid, hi_nid); + + *gateway_nalidp = (__u32)gateway_nalid; + *gateway_nidp = (__u32)gateway_nid; + *lo_nidp = (__u32)lo_nid; + *hi_nidp = (__u32)hi_nid; + } + + PORTAL_SYMBOL_PUT (kpr_control_interface); + return (rc); +} + +static int +kportal_nal_cmd(int nal, struct portal_ioctl_data *data) +{ + int rc = -EINVAL; + + ENTRY; + + down(&nal_cmd_sem); + if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { + CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd); + rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private); + } + up(&nal_cmd_sem); + RETURN(rc); +} + +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + return (PORTAL_SYMBOL_GET(kqswnal_ni)); + case SOCKNAL: + return (PORTAL_SYMBOL_GET(ksocknal_ni)); + case TOENAL: + return (PORTAL_SYMBOL_GET(ktoenal_ni)); + case GMNAL: + return (PORTAL_SYMBOL_GET(kgmnal_ni)); + case TCPNAL: + /* userspace NAL */ + return 
(NULL); + case SCIMACNAL: + return (PORTAL_SYMBOL_GET(kscimacnal_ni)); + default: + /* A warning to a naive caller */ + CERROR ("unknown nal: %d\n", nal); + return (NULL); + } +} + +void +kportal_put_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + PORTAL_SYMBOL_PUT(kqswnal_ni); + break; + case SOCKNAL: + PORTAL_SYMBOL_PUT(ksocknal_ni); + break; + case TOENAL: + PORTAL_SYMBOL_PUT(ktoenal_ni); + break; + case GMNAL: + PORTAL_SYMBOL_PUT(kgmnal_ni); + break; + case TCPNAL: + /* A lesson to a malicious caller */ + LBUG (); + case SCIMACNAL: + PORTAL_SYMBOL_PUT(kscimacnal_ni); + break; + default: + CERROR ("unknown nal: %d\n", nal); + } +} + +int +kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + if (nal_cmd[nal].nch_handler != NULL) + rc = -EBUSY; + else { + nal_cmd[nal].nch_handler = handler; + nal_cmd[nal].nch_private = private; + } + up(&nal_cmd_sem); + } + return rc; +} + +int +kportal_nal_unregister(int nal) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + nal_cmd[nal].nch_handler = NULL; + nal_cmd[nal].nch_private = NULL; + up(&nal_cmd_sem); + } + return rc; +} + + +static int kportal_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + char buf[1024]; + struct portal_ioctl_data *data; + + ENTRY; + + if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || + _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || + _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + RETURN(-EINVAL); + } + + if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + RETURN(-EINVAL); + } + + data = (struct portal_ioctl_data *)buf; + + switch (cmd) { + case 
IOC_PORTAL_SET_DAEMON: + RETURN (portals_debug_set_daemon ( + (unsigned int) data->ioc_count, + (unsigned int) data->ioc_inllen1, + (char *) data->ioc_inlbuf1, + (unsigned int) data->ioc_misc)); + case IOC_PORTAL_GET_DEBUG: { + __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1, + data->ioc_plen1); + + if (size < 0) + RETURN(size); + + data->ioc_size = size; + /* copy_to_user() returns the number of bytes it could NOT copy, + * not an errno: map any failure to -EFAULT like the other cases */ + if (copy_to_user((char *)arg, data, sizeof(*data))) + RETURN(-EFAULT); + RETURN(0); + } + case IOC_PORTAL_CLEAR_DEBUG: + portals_debug_clear_buffer(); + RETURN(0); + case IOC_PORTAL_PANIC: + if (!capable (CAP_SYS_BOOT)) + RETURN (-EPERM); + panic("debugctl-invoked panic"); + RETURN(0); + case IOC_PORTAL_MARK_DEBUG: + /* reject a zero length before indexing [ioc_inllen1 - 1] */ + if (data->ioc_inlbuf1 == NULL || + data->ioc_inllen1 == 0 || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + RETURN(-EINVAL); + portals_debug_mark_buffer(data->ioc_inlbuf1); + RETURN(0); + case IOC_PORTAL_PING: { + void (*ping)(struct portal_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n", + data->ioc_count, data->ioc_nid); + ping = PORTAL_SYMBOL_GET(kping_client); + if (!ping) + CERROR("PORTAL_SYMBOL_GET failed\n"); + else { + ping(data); + PORTAL_SYMBOL_PUT(kping_client); + } + RETURN(0); + } + + case IOC_PORTAL_ADD_ROUTE: + CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, data->ioc_nid2, + data->ioc_nid3); + err = kportal_add_route(data->ioc_nal, data->ioc_nid, + MIN (data->ioc_nid2, data->ioc_nid3), + MAX (data->ioc_nid2, data->ioc_nid3)); + break; + + case IOC_PORTAL_DEL_ROUTE: + CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); + err = kportal_del_route (data->ioc_nid); + break; + + case IOC_PORTAL_GET_ROUTE: + CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); + err = kportal_get_route(data->ioc_count, &data->ioc_nal, + &data->ioc_nid, &data->ioc_nid2, + &data->ioc_nid3); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_GET_NID: { + const ptl_handle_ni_t *nip; +
ptl_process_id_t pid; + + CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlGetId (*nip, &pid); + LASSERT (err == PTL_OK); + kportal_put_ni (data->ioc_nal); + + data->ioc_nid = pid.nid; + if (copy_to_user ((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + } + + case IOC_PORTAL_NAL_CMD: + CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal, + data->ioc_nal_cmd); + err = kportal_nal_cmd(data->ioc_nal, data); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_FAIL_NID: { + const ptl_handle_ni_t *nip; + + CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", + data->ioc_nal, data->ioc_nid, data->ioc_count); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count); + /* drop the reference taken by kportal_get_ni(), as the + * IOC_PORTAL_GET_NID case does */ + kportal_put_ni (data->ioc_nal); + break; + } + + default: + err = -EINVAL; + break; + } + + RETURN(err); +} + + +static struct file_operations portalsdev_fops = { + ioctl: kportal_ioctl, + open: kportal_psdev_open, + release: kportal_psdev_release +}; + + +static struct miscdevice portal_dev = { + PORTAL_MINOR, + "portals", + &portalsdev_fops +}; + +extern int insert_proc(void); +extern void remove_proc(void); +MODULE_AUTHOR("Peter J.
Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +static int init_kportals_module(void) +{ + int rc; + + rc = portals_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "portals_debug_init: %d\n", rc); + return (rc); + } + + sema_init(&nal_cmd_sem, 1); + + rc = misc_register(&portal_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_debug; + } + + rc = PtlInit(); + if (rc) { + CERROR("PtlInit: error %d\n", rc); + goto cleanup_deregister; + } + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_fini; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return (0); + + cleanup_fini: + PtlFini(); + cleanup_deregister: + misc_deregister(&portal_dev); + cleanup_debug: + portals_debug_cleanup(); + return rc; +} + +static void exit_kportals_module(void) +{ + int rc; + + remove_proc(); + PtlFini(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&portal_kmemory)); + + + rc = misc_deregister(&portal_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + if (atomic_read(&portal_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&portal_kmemory)); + + rc = portals_debug_cleanup(); + if (rc) + printk(KERN_ERR "portals_debug_cleanup: %d\n", rc); +} + +EXPORT_SYMBOL(lib_dispatch); +EXPORT_SYMBOL(PtlMEAttach); +EXPORT_SYMBOL(PtlMEInsert); +EXPORT_SYMBOL(PtlMEUnlink); +EXPORT_SYMBOL(PtlEQAlloc); +EXPORT_SYMBOL(PtlMDAttach); +EXPORT_SYMBOL(PtlMDUnlink); +EXPORT_SYMBOL(PtlNIInit); +EXPORT_SYMBOL(PtlNIFini); +EXPORT_SYMBOL(PtlNIDebug); +EXPORT_SYMBOL(PtlInit); +EXPORT_SYMBOL(PtlFini); +EXPORT_SYMBOL(PtlPut); +EXPORT_SYMBOL(PtlGet); +EXPORT_SYMBOL(ptl_err_str); +EXPORT_SYMBOL(portal_subsystem_debug); +EXPORT_SYMBOL(portal_debug); +EXPORT_SYMBOL(portal_stack); +EXPORT_SYMBOL(portal_printk); +EXPORT_SYMBOL(PtlEQWait); +EXPORT_SYMBOL(PtlEQFree); +EXPORT_SYMBOL(PtlEQGet); +EXPORT_SYMBOL(PtlGetId); +EXPORT_SYMBOL(PtlMDBind); 
+EXPORT_SYMBOL(lib_iov_nob); +EXPORT_SYMBOL(lib_copy_iov2buf); +EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_kiov_nob); +EXPORT_SYMBOL(lib_copy_kiov2buf); +EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_finalize); +EXPORT_SYMBOL(lib_parse); +EXPORT_SYMBOL(lib_init); +EXPORT_SYMBOL(lib_fini); +EXPORT_SYMBOL(portal_kmemory); +EXPORT_SYMBOL(kportal_daemonize); +EXPORT_SYMBOL(kportal_blockallsigs); +EXPORT_SYMBOL(kportal_nal_register); +EXPORT_SYMBOL(kportal_nal_unregister); +EXPORT_SYMBOL(kportal_assertion_failed); +EXPORT_SYMBOL(dispatch_name); +EXPORT_SYMBOL(kportal_get_ni); +EXPORT_SYMBOL(kportal_put_ni); + +module_init(init_kportals_module); +module_exit (exit_kportals_module); diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c new file mode 100644 index 0000000..2fa739a --- /dev/null +++ b/lustre/portals/libcfs/proc.c @@ -0,0 +1,290 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + +static struct ctl_table_header *portals_table_header = NULL; +extern char debug_file_path[1024]; +extern char debug_daemon_file_path[1024]; +extern char portals_upcall[1024]; + +#define PSDEV_PORTALS (0x100) +#define PSDEV_DEBUG 1 /* control debugging */ +#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */ +#define PSDEV_PRINTK 3 /* force all errors to console */ +#define PSDEV_DEBUG_PATH 4 /* crashdump log location */ +#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */ +#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */ + +#define PORTALS_PRIMARY_CTLCNT 6 +static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { + {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, + sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, + {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path, + sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring, + &sysctl_string}, + {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, + sizeof(portals_upcall), 0644, NULL, &proc_dostring, + &sysctl_string}, + {0} +}; + +static struct ctl_table top_table[2] = { + {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, + {0} +}; + + +#ifdef PORTALS_PROFILING +/* + * profiling stuff. we do this statically for now 'cause its simple, + * but we could do some tricks with elf sections to have this array + * automatically built. 
+ */ +#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } + +struct prof_ent prof_ents[] = { + def_prof(our_recvmsg), + def_prof(our_sendmsg), + def_prof(socknal_recv), + def_prof(lib_parse), + def_prof(conn_list_walk), + def_prof(memcpy), + def_prof(lib_finalize), + def_prof(pingcli_time), + def_prof(gmnal_send), + def_prof(gmnal_recv), +}; + +EXPORT_SYMBOL(prof_ents); + +/* + * this function is as crazy as the proc filling api + * requires. + * + * buffer: page allocated for us to scribble in. the + * data returned to the user will be taken from here. + * *start: address of the pointer that will tell the + * caller where in buffer the data the user wants is. + * ppos: offset in the entire /proc file that the user + * currently wants. + * wanted: the amount of data the user wants. + * + * while going, 'curpos' is the offset in the entire + * file where we currently are. We only actually + * start filling buffer when we get to a place in + * the file that the user cares about. + * + * we take care to only sprintf when the user cares because + * we're holding a lock while we do this. + * + * we're smart and know that we generate fixed size lines. + * we only start writing to the buffer when the user cares. + * This is unpredictable because we don't snapshot the + * list between calls that are filling in a file from + * the list. The list could change mid read and the + * output will look very weird indeed. oh well. 
+ */ + +static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted, + int *eof, void *data) +{ + int len = 0, i; + int curpos; + char *header = "Interval Cycles_per (Starts Finishes Total)\n"; + int header_len = strlen(header); + char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)"; + int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1); + + *start = buffer; + + if (ppos < header_len) { + /* only the header bytes at or after ppos are left to copy; + * bounding by header_len alone would read past the string */ + int diff = MIN(header_len - (int)ppos, wanted); + memcpy(buffer, header + ppos, diff); + len += diff; + ppos += diff; + } + + if (len >= wanted) + goto out; + + curpos = header_len; + + for ( i = 0; i < MAX_PROFS ; i++) { + int copied; + struct prof_ent *pe = &prof_ents[i]; + long long cycles_per; + /* + * find the part of the array that the buffer wants + */ + if (ppos >= (curpos + line_len)) { + curpos += line_len; + continue; + } + /* the clever caller split a line */ + if (ppos > curpos) { + *start = buffer + (ppos - curpos); + } + + if (pe->finishes == 0) + cycles_per = 0; + else + { + cycles_per = pe->total_cycles; + do_div (cycles_per, pe->finishes); + } + + copied = sprintf(buffer + len, format, pe->str, cycles_per, + pe->starts, pe->finishes, pe->total_cycles); + + len += copied; + + /* pad to line len, -1 for \n */ + if ((copied < line_len-1)) { + int diff = (line_len-1) - copied; + memset(buffer + len, ' ', diff); + len += diff; + copied += diff; + } + + buffer[len++]= '\n'; + + /* bail if we have enough */ + if (((buffer + len) - *start) >= wanted) + break; + + curpos += line_len; + } + + /* lameness */ + if (i == MAX_PROFS) + *eof = 1; + out: + + return MIN(((buffer + len) - *start), wanted); +} + +/* + * all kids love /proc :/ + */ +static unsigned char basedir[]="net/portals"; +#endif /* PORTALS_PROFILING */ + +int insert_proc(void) +{ +#if PORTALS_PROFILING + unsigned char dir[128]; + struct proc_dir_entry *ent; + + if (ARRAY_SIZE(prof_ents) != MAX_PROFS) { + CERROR("profiling enum and array are out of sync.\n"); + return -1; + } + + /* + * This is
pretty lame. assuming that failure just + * means that they already existed. + */ + /* dir is an uninitialised stack array: terminate it before + * strcat() appends to it */ + dir[0] = '\0'; + strcat(dir, basedir); + create_proc_entry(dir, S_IFDIR, 0); + + strcat(dir, "/cycles"); + ent = create_proc_entry(dir, 0, 0); + if (!ent) { + CERROR("couldn't register %s?\n", dir); + return -1; + } + + ent->data = NULL; + ent->read_proc = prof_read_proc; +#endif /* PORTALS_PROFILING */ + +#ifdef CONFIG_SYSCTL + if (!portals_table_header) + portals_table_header = register_sysctl_table(top_table, 0); +#endif + + return 0; +} + +void remove_proc(void) +{ +#if PORTALS_PROFILING + unsigned char dir[128]; + int end; + + dir[0]='\0'; + strcat(dir, basedir); + + end = strlen(dir); + + strcat(dir, "/cycles"); + remove_proc_entry(dir,0); + + dir[end] = '\0'; + remove_proc_entry(dir,0); +#endif /* PORTALS_PROFILING */ + +#ifdef CONFIG_SYSCTL + if (portals_table_header) + unregister_sysctl_table(portals_table_header); + portals_table_header = NULL; +#endif +} diff --git a/lustre/portals/packaging/.cvsignore b/lustre/portals/packaging/.cvsignore new file mode 100644 index 0000000..fd1d56a --- /dev/null +++ b/lustre/portals/packaging/.cvsignore @@ -0,0 +1,8 @@ +Makefile +Makefile.in +aclocal.m4 +config.log +config.status +config.cache +configure +portals.spec diff --git a/lustre/portals/packaging/Makefile.am b/lustre/portals/packaging/Makefile.am new file mode 100644 index 0000000..126bc69 --- /dev/null +++ b/lustre/portals/packaging/Makefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution + +EXTRA_DIST = portals.spec \ No newline at end of file diff --git a/lustre/portals/packaging/portals.spec.in b/lustre/portals/packaging/portals.spec.in new file mode 100644 index 0000000..e196b3f --- /dev/null +++ b/lustre/portals/packaging/portals.spec.in @@ -0,0 +1,116 @@ +%define kversion @RELEASE@ +%define linuxdir @LINUX@ +%define version HEAD + +Summary: Sandia Portals Message Passing - utilities +Name: portals +Version: %{version} +Release: 0210101748uml +Copyright: LGPL +Group: Utilities/System +BuildRoot: /var/tmp/portals-%{version}-root +Source: http://sandiaportals.org/portals-%{version}.tar.gz + +%description +Sandia Portals message passing package. Contains kernel modules, libraries and utilities. + +%package -n portals-modules +Summary: Kernel modules and NAL's for portals +Group: Development/Kernel + +%description -n portals-modules +Portals kernel modules and NALs for Linux %{kversion}. + +%package -n portals-source +Summary: Portals kernel source for rebuilding with other kernels +Group: Development/Kernel + +%description -n portals-source +Portals kernel source for rebuilding with other kernels + +%prep +%setup -n portals-%{version} + +%build +rm -rf $RPM_BUILD_ROOT + +# Create the pristine source directory. +srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version} +mkdir -p $srcdir +find . -name CVS -prune -o -print | cpio -ap $srcdir + +# Set an explicit path to our Linux tree, if we can.
+conf_flag= +linuxdir=%{linuxdir} +test -d $linuxdir && conf_flag=--with-linux=$linuxdir +./configure $conf_flag +make + +%install +make install prefix=$RPM_BUILD_ROOT + +%ifarch alpha +# this hurts me + conf_flag= + linuxdir=%{linuxdir} + test -d $linuxdir && conf_flag=--with-linux=$linuxdir + make clean + ./configure --enable-rtscts-myrinet $conf_flag + make + cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o + cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload +%endif + + +%files +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /usr/sbin/acceptor +%attr(-, root, root) /usr/sbin/ptlctl +%attr(-, root, root) /usr/sbin/debugctl +%ifarch alpha +%attr(-, root, root) /usr/sbin/mcpload +%endif +%attr(-, root, root) /lib/libmyrnal.a +%attr(-, root, root) /lib/libptlapi.a +%attr(-, root, root) /lib/libptlctl.a +%attr(-, root, root) /lib/libprocbridge.a +%attr(-, root, root) /lib/libptllib.a +%attr(-, root, root) /lib/libtcpnal.a +%attr(-, root, root) /lib/libtcpnalutil.a +%attr(-, root, root) /usr/include/portals/*.h +%attr(-, root, root) /usr/include/portals/base/*.h +%attr(-, root, root) /usr/include/linux/*.h + +%files -n portals-modules +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o +%ifarch alpha +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o +%endif +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o + +%files -n portals-source +%attr(-, root, root) /usr/src/portals-%{version} + +%post +if [ ! 
-e /dev/portals ]; then + mknod /dev/portals c 10 240 +fi +depmod -ae || exit 0 + +grep -q portals /etc/modules.conf || \ + echo 'alias char-major-10-240 portals' >> /etc/modules.conf + +grep -q '/dev/portals' /etc/modules.conf || \ + echo 'alias /dev/portals portals' >> /etc/modules.conf + +%postun +depmod -ae || exit 0 + +%clean +#rm -rf $RPM_BUILD_ROOT + +# end of file diff --git a/lustre/portals/portals/Makefile.am b/lustre/portals/portals/Makefile.am new file mode 100644 index 0000000..9fb7f6f --- /dev/null +++ b/lustre/portals/portals/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + + +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include +lib_LIBRARIES= libportals.a +libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-md.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-not-impl.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk new file mode 100644 index 0000000..5627ef7 --- /dev/null +++ b/lustre/portals/portals/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += portals.o +portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c new file mode 100644 index 0000000..57427f6 --- /dev/null +++ b/lustre/portals/portals/api-eq.c @@ -0,0 +1,161 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-eq.c + * User-level event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * PtlMDUpdate is here so that it can access the per-eventq + * structures. + */ + +#include + +int ptl_eq_init(void) +{ + /* Nothing to do anymore... */ + return PTL_OK; +} + +void ptl_eq_fini(void) +{ + /* Nothing to do anymore... */ +} + +int ptl_eq_ni_init(nal_t * nal) +{ + /* Nothing to do anymore... */ + return PTL_OK; +} + +void ptl_eq_ni_fini(nal_t * nal) +{ + /* Nothing to do anymore... 
*/ +} + +int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) +{ + ptl_eq_t *eq; + int rc, new_index; + unsigned long flags; + ptl_event_t *new_event; + nal_t *nal; + ENTRY; + + if (!ptl_init) + RETURN(PTL_NOINIT); + + nal = ptl_hndl2nal(&eventq); + if (!nal) + RETURN(PTL_INV_EQ); + + eq = ptl_handle2usereq(&eventq); + nal->lock(nal, &flags); + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + + new_index = eq->sequence & (eq->size - 1); + new_event = &eq->base[new_index]; + CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->sequence, eq->size); + if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) { + nal->unlock(nal, &flags); + RETURN(PTL_EQ_EMPTY); + } + + *ev = *new_event; + + /* Set the unlinked_me interface number if there is one to pass + * back, since the NAL hasn't a clue what it is and therefore can't + * set it. */ + if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) + ev->unlinked_me.nal_idx = eventq.nal_idx; + + /* ensure event is delivered correctly despite possible + races with lib_finalize */ + if (eq->sequence != new_event->sequence) { + CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n", + eq->sequence, new_event->sequence); + rc = PTL_EQ_DROPPED; + } else { + rc = PTL_OK; + } + + eq->sequence = new_event->sequence + 1; + nal->unlock(nal, &flags); + RETURN(rc); +} + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) +{ + int rc; + + /* PtlEQGet does the handle checking */ + while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { + nal_t *nal = ptl_hndl2nal(&eventq_in); + + if (nal->yield) + nal->yield(nal); + } + + return rc; +} + +#ifndef __KERNEL__ +static jmp_buf eq_jumpbuf; + +static void eq_timeout(int signal) +{ + longjmp(eq_jumpbuf, -1); +} + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout) +{ + static void (*prev) (int); + static int left_over; + time_t 
time_at_start; + int rc; + + if (setjmp(eq_jumpbuf)) { + signal(SIGALRM, prev); + alarm(left_over - timeout); + return PTL_EQ_EMPTY; + } + + left_over = alarm(timeout); + prev = signal(SIGALRM, eq_timeout); + time_at_start = time(NULL); + if (left_over < timeout) + alarm(left_over); + + rc = PtlEQWait(eventq_in, event_out); + + signal(SIGALRM, prev); + alarm(left_over); /* Should compute how long we waited */ + + return rc; +} + +#endif + diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c new file mode 100644 index 0000000..5cb0980 --- /dev/null +++ b/lustre/portals/portals/api-errno.c @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-errno.c + * Instantiate the string table of errors + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include + +/* If you change these, you must update the number table in portals/errno.h */ +const char *ptl_err_str[] = { + "PTL_OK", + "PTL_SEGV", + + "PTL_NOSPACE", + "PTL_INUSE", + "PTL_VAL_FAILED", + + "PTL_NAL_FAILED", + "PTL_NOINIT", + "PTL_INIT_DUP", + "PTL_INIT_INV", + "PTL_AC_INV_INDEX", + + "PTL_INV_ASIZE", + "PTL_INV_HANDLE", + "PTL_INV_MD", + "PTL_INV_ME", + "PTL_INV_NI", +/* If you change these, you must update the number table in portals/errno.h */ + "PTL_ILL_MD", + "PTL_INV_PROC", + "PTL_INV_PSIZE", + "PTL_INV_PTINDEX", + "PTL_INV_REG", + + "PTL_INV_SR_INDX", + "PTL_ML_TOOLONG", + "PTL_ADDR_UNKNOWN", + "PTL_INV_EQ", + "PTL_EQ_DROPPED", + + "PTL_EQ_EMPTY", + "PTL_NOUPDATE", + "PTL_FAIL", + "PTL_NOT_IMPLEMENTED", + "PTL_NO_ACK", + + "PTL_IOV_TOO_MANY", + "PTL_IOV_TOO_SMALL", + + "PTL_EQ_INUSE", + "PTL_MD_INUSE" +}; +/* If you change these, you must update the number table in portals/errno.h */ diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c new file mode 100644 index 0000000..b54f684 --- /dev/null +++ b/lustre/portals/portals/api-init.c @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-init.c + * Initialization and global data for the p30 user side library + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * All handles have their interface number stored in the second 16 bit word + */ + +#include + +int ptl_init; +unsigned int portal_subsystem_debug = 0xfff7e3ff; +unsigned int portal_debug = ~0; +unsigned int portal_printk; +unsigned int portal_stack; + +#ifdef __KERNEL__ +atomic_t portal_kmemory = ATOMIC_INIT(0); +#endif + +int __p30_initialized; +int __p30_myr_initialized; +int __p30_ip_initialized; +ptl_handle_ni_t __myr_ni_handle; +ptl_handle_ni_t __ip_ni_handle; + +int __p30_myr_timeout = 10; +int __p30_ip_timeout; + +int PtlInit(void) +{ + + if (ptl_init) + return PTL_OK; + + ptl_ni_init(); + ptl_me_init(); + ptl_eq_init(); + ptl_init = 1; + __p30_initialized = 1; + + return PTL_OK; +} + + +void PtlFini(void) +{ + + /* Reverse order of initialization */ + ptl_eq_fini(); + ptl_me_fini(); + ptl_ni_fini(); + ptl_init = 0; +} diff --git a/lustre/portals/portals/api-md.c b/lustre/portals/portals/api-md.c new file mode 100644 index 0000000..967112f --- /dev/null +++ b/lustre/portals/portals/api-md.c @@ -0,0 +1,9 @@ +/* + * api-p30/md.c + * + * Memory descriptor functions that need address validation + * There are a few standing issues... + * - Addresses are invalidated by the library without telling us. + */ +#include + diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c new file mode 100644 index 0000000..573e948 --- /dev/null +++ b/lustre/portals/portals/api-me.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-me.c + * Match Entry local operations. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +int ptl_me_init(void) +{ + return PTL_OK; +} +void ptl_me_fini(void) +{ /* Nothing to do */ +} +int ptl_me_ni_init(nal_t * nal) +{ + return PTL_OK; +} + +void ptl_me_ni_fini(nal_t * nal) +{ /* Nothing to do... */ +} diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c new file mode 100644 index 0000000..952da4f --- /dev/null +++ b/lustre/portals/portals/api-ni.c @@ -0,0 +1,184 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-ni.c + * Network Interface code + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +#define MAX_NIS 8 +static nal_t *ptl_interfaces[MAX_NIS]; +int ptl_num_interfaces = 0; + +nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) +{ + unsigned int idx = handle->nal_idx; + + /* XXX we really rely on the caller NOT racing with interface + * setup/teardown. That ensures her NI handle can't get + * invalidated out from under her (or worse, swapped for a + * completely different interface!) */ + + if (idx < MAX_NIS) + return ptl_interfaces[idx]; + + return NULL; +} + +int ptl_ni_init(void) +{ + int i; + + for (i = 0; i < MAX_NIS; i++) + ptl_interfaces[i] = NULL; + + return PTL_OK; +} + +void ptl_ni_fini(void) +{ + int i; + + for (i = 0; i < MAX_NIS; i++) { + nal_t *nal = ptl_interfaces[i]; + if (!nal) + continue; + + if (nal->shutdown) + nal->shutdown(nal, i); + } +} + +#ifdef __KERNEL__ +DECLARE_MUTEX(ptl_ni_init_mutex); + +static void ptl_ni_init_mutex_enter (void) +{ + down (&ptl_ni_init_mutex); +} + +static void ptl_ni_init_mutex_exit (void) +{ + up (&ptl_ni_init_mutex); +} + +#else +static void ptl_ni_init_mutex_enter (void) +{ +} + +static void ptl_ni_init_mutex_exit (void) +{ +} + +#endif + +int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, ptl_pid_t requested_pid, + ptl_handle_ni_t * handle) +{ + nal_t *nal; + int i; + + if (!ptl_init) + return PTL_NOINIT; + + ptl_ni_init_mutex_enter (); + + nal = interface(ptl_num_interfaces, ptl_size, acl_size, requested_pid); + + if (!nal) { + ptl_ni_init_mutex_exit (); + return PTL_NAL_FAILED; + } + + for (i = 0; i < ptl_num_interfaces; i++) { + if (ptl_interfaces[i] == nal) { + nal->refct++; + handle->nal_idx = i; + fprintf(stderr, "Returning existing NAL (%d)\n", i); + ptl_ni_init_mutex_exit 
(); + return PTL_OK; + } + } + nal->refct = 1; + + handle->nal_idx = ptl_num_interfaces; + if (ptl_num_interfaces >= MAX_NIS) { + if (nal->shutdown) + nal->shutdown (nal, ptl_num_interfaces); + ptl_ni_init_mutex_exit (); + return PTL_NOSPACE; + } + + ptl_interfaces[ptl_num_interfaces++] = nal; + + ptl_eq_ni_init(nal); + ptl_me_ni_init(nal); + + ptl_ni_init_mutex_exit (); + return PTL_OK; +} + + +int PtlNIFini(ptl_handle_ni_t ni) +{ + nal_t *nal; + int rc; + + if (!ptl_init) + return PTL_NOINIT; + + ptl_ni_init_mutex_enter (); + + nal = ptl_hndl2nal (&ni); + if (nal == NULL) { + ptl_ni_init_mutex_exit (); + return PTL_INV_HANDLE; + } + + nal->refct--; + if (nal->refct > 0) { + ptl_ni_init_mutex_exit (); + return PTL_OK; + } + + ptl_me_ni_fini(nal); + ptl_eq_ni_fini(nal); + + rc = PTL_OK; + if (nal->shutdown) + rc = nal->shutdown(nal, ni.nal_idx); + + ptl_interfaces[ni.nal_idx] = NULL; + ptl_num_interfaces--; + + ptl_ni_init_mutex_exit (); + return rc; +} + +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out) +{ + *ni_out = handle_in; + + return PTL_OK; +} diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c new file mode 100644 index 0000000..cbd4d1f --- /dev/null +++ b/lustre/portals/portals/api-wrap.c @@ -0,0 +1,601 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-wrap.c + * User-level wrappers that dispatch across the protection boundaries + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Assumes the handle encodes the network number in the second 16 bit word + */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include + +static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, + int argsize, void *retbuf, int retsize) +{ + nal_t *nal; + + if (!ptl_init) { + fprintf(stderr, "PtlGetId: Not initialized\n"); + return PTL_NOINIT; + } + + nal = ptl_hndl2nal(&any_h); + if (!nal) + return PTL_INV_HANDLE; + + nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize); + + return PTL_OK; +} + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) +{ + PtlGetId_in args; + PtlGetId_out ret; + int rc; + + args.handle_in = ni_handle; + + rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return rc; + + if (id) + *id = ret.id_out; + + return ret.rc; +} + +int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) +{ + PtlFailNid_in args; + PtlFailNid_out ret; + int rc; + + args.interface = interface; + args.nid = nid; + args.threshold = threshold; + + rc = do_forward (interface, PTL_FAILNID, + &args, sizeof(args), &ret, sizeof (ret)); + + return ((rc != PTL_OK) ? 
rc : ret.rc); +} + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out) +{ + PtlNIStatus_in args; + PtlNIStatus_out ret; + int rc; + + args.interface_in = interface_in; + args.register_in = register_in; + + rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (status_out) + *status_out = ret.status_out; + + return ret.rc; +} + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out) +{ + PtlNIDist_in args; + PtlNIDist_out ret; + int rc; + + args.interface_in = interface_in; + args.process_in = process_in; + + rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (distance_out) + *distance_out = ret.distance_out; + + return ret.rc; +} + + + +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in) +{ + PtlNIDebug_in args; + PtlNIDebug_out ret; + int rc; + + args.mask_in = mask_in; + + rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + return ret.rc; +} + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out) +{ + PtlMEAttach_in args; + PtlMEAttach_out ret; + int rc; + + args.interface_in = interface_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.match_bits_in = match_bits_in; + args.ignore_bits_in = ignore_bits_in; + args.unlink_in = unlink_in; + args.position_in = pos_in; + + rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (handle_out) { + handle_out->nal_idx = interface_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + + return ret.rc; +} 
+ +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out) +{ + PtlMEInsert_in args; + PtlMEInsert_out ret; + int rc; + + args.current_in = current_in; + args.match_id_in = match_id_in; + args.match_bits_in = match_bits_in; + args.ignore_bits_in = ignore_bits_in; + args.unlink_in = unlink_in; + args.position_in = position_in; + + rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = current_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMEUnlink(ptl_handle_me_t current_in) +{ + PtlMEUnlink_in args; + PtlMEUnlink_out ret; + int rc; + + args.current_in = current_in; + args.unlink_in = PTL_RETAIN; + + rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + return ret.rc; +} + +int PtlTblDump(ptl_handle_ni_t ni, int index_in) +{ + PtlTblDump_in args; + PtlTblDump_out ret; + int rc; + + args.index_in = index_in; + + rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + return ret.rc; +} + +int PtlMEDump(ptl_handle_me_t current_in) +{ + PtlMEDump_in args; + PtlMEDump_out ret; + int rc; + + args.current_in = current_in; + + rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + return ret.rc; +} + +static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) +{ + nal_t *nal; + int rc; + int i; + + if (!ptl_init) { + fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n"); + return PTL_NOINIT; + } + + nal = ptl_hndl2nal(¤t_in); + if (!nal) + return PTL_INV_HANDLE; + + if (nal->validate != NULL) /* nal->validate not a NOOP */ + { + if ((md_in.options & PTL_MD_IOV) == 0) /* contiguous */ + { + rc = nal->validate (nal, md_in.start, md_in.length); + if (rc) + return (PTL_SEGV); + } + else + { + struct iovec *iov = (struct iovec *)md_in.start; + + for (i = 0; i < md_in.niov; i++, iov++) + { + rc = nal->validate (nal, iov->iov_base, iov->iov_len); + if (rc) + return (PTL_SEGV); + } + } + } + + return 0; +} + +static ptl_handle_eq_t md2eq (ptl_md_t *md) +{ + if (PtlHandleEqual (md->eventq, PTL_EQ_NONE)) + return (PTL_EQ_NONE); + + return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); +} + + +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) +{ + PtlMDAttach_in args; + PtlMDAttach_out ret; + int rc; + + rc = validate_md(me_in, md_in); + if (rc == PTL_OK) { + args.eq_in = md2eq(&md_in); + args.me_in = me_in; + args.md_in = md_in; + args.unlink_in = unlink_in; + + rc = do_forward(me_in, PTL_MDATTACH, + &args, sizeof(args), &ret, sizeof(ret)); + } + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = me_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + + + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out) +{ + PtlMDBind_in args; + PtlMDBind_out ret; + int rc; + + rc = validate_md(ni_in, md_in); + if (rc != PTL_OK) + return rc; + + args.eq_in = md2eq(&md_in); + args.ni_in = ni_in; + args.md_in = md_in; + + rc = do_forward(ni_in, PTL_MDBIND, + &args, sizeof(args), &ret, sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (handle_out) { + handle_out->nal_idx = ni_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) +{ + PtlMDUpdate_internal_in args; + PtlMDUpdate_internal_out ret; + int rc; + + args.md_in = md_in; + + if (old_inout) { + args.old_inout = *old_inout; + args.old_inout_valid = 1; + } else + args.old_inout_valid = 0; + + if (new_inout) { + rc = validate_md (md_in, *new_inout); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + args.new_inout = *new_inout; + args.new_inout_valid = 1; + } else + args.new_inout_valid = 0; + + if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) { + args.testq_in = PTL_EQ_NONE; + args.sequence_in = -1; + } else { + ptl_eq_t *eq = ptl_handle2usereq (&testq_in); + + args.testq_in = eq->cb_eq_handle; + args.sequence_in = eq->sequence; + } + + rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + + if (old_inout) + *old_inout = ret.old_inout; + + return ret.rc; +} + +int PtlMDUnlink(ptl_handle_md_t md_in) +{ + PtlMDUnlink_in args; + PtlMDUnlink_out ret; + int rc; + + args.md_in = md_in; + rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_MD : rc; + + return ret.rc; +} + +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out) +{ + ptl_eq_t *eq = NULL; + ptl_event_t *ev = NULL; + PtlEQAlloc_in args; + PtlEQAlloc_out ret; + int rc, i; + nal_t *nal; + + if (!ptl_init) + return PTL_NOINIT; + + nal = ptl_hndl2nal (&interface); + if (nal == NULL) + return PTL_INV_HANDLE; + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... */ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + + PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); + if (!ev) + return PTL_NOSPACE; + + for (i = 0; i < count; i++) + ev[i].sequence = 0; + + if (nal->validate != NULL) { + rc = nal->validate(nal, ev, count * sizeof(ptl_event_t)); + if (rc != PTL_OK) + goto fail; + } + + args.ni_in = interface; + args.count_in = count; + args.base_in = ev; + args.len_in = count * sizeof(*ev); + args.callback_in = callback; + + rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + goto fail; + if (ret.rc) + GOTO(fail, rc = ret.rc); + + PORTAL_ALLOC(eq, sizeof(*eq)); + if (!eq) { + rc = PTL_NOSPACE; + goto fail; + } + + eq->sequence = 1; + eq->size = count; + eq->base = ev; + + /* EQ handles are a little wierd. PtlEQGet() just looks at the + * queued events in shared memory. It doesn't want to do_forward() + * at all, so the cookie in the EQ handle we pass out of here is + * simply a pointer to the event queue we just set up. We stash + * the handle returned by do_forward(), so we can pass it back via + * do_forward() when we need to. 
*/ + + eq->cb_eq_handle.nal_idx = interface.nal_idx; + eq->cb_eq_handle.cookie = ret.handle_out.cookie; + + handle_out->nal_idx = interface.nal_idx; + handle_out->cookie = (__u64)((unsigned long)eq); + return PTL_OK; + +fail: + PORTAL_FREE(ev, count * sizeof(ptl_event_t)); + return rc; +} + +int PtlEQFree(ptl_handle_eq_t eventq) +{ + PtlEQFree_in args; + PtlEQFree_out ret; + ptl_eq_t *eq; + int rc; + + eq = ptl_handle2usereq (&eventq); + args.eventq_in = eq->cb_eq_handle; + + rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, + sizeof(args), &ret, sizeof(ret)); + + /* XXX we're betting rc == PTL_OK here */ + PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); + PORTAL_FREE(eq, sizeof(*eq)); + + return rc; +} + +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) +{ + PtlACEntry_in args; + PtlACEntry_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.ni_in = ni_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.portal_in = portal_in; + + rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, + sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) +{ + PtlPut_in args; + PtlPut_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.ack_req_in = ack_req_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + args.hdr_data_in = hdr_data_in; + + rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? 
rc : ret.rc; +} + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in) +{ + PtlGet_in args; + PtlGet_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + + rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c new file mode 100644 index 0000000..63ed70f --- /dev/null +++ b/lustre/portals/portals/lib-dispatch.c @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-dispatch.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +typedef struct { + int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); + char *name; +} dispatch_table_t; + +static dispatch_table_t dispatch_table[] = { + [PTL_GETID] {do_PtlGetId, "PtlGetId"}, + [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, + [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, + [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"}, + [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, + [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, + [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, + [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, + [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, + [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, + [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, + [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, + [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, + [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, + [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, + [PTL_ACENTRY] {do_PtlACEntry, "PtlACEntry"}, + [PTL_PUT] {do_PtlPut, "PtlPut"}, + [PTL_GET] {do_PtlGet, "PtlGet"}, + [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, + /* */ {0, ""} +}; + +/* + * This really should be elsewhere, but lib-p30/dispatch.c is + * an automatically generated file. 
+ */ +void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, + void *ret_block) +{ + lib_ni_t *ni = &nal->ni; + + if (index < 0 || index > LIB_MAX_DISPATCH || + !dispatch_table[index].fun) { + CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); + return; + } + + CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, + dispatch_table[index].name, index); + + dispatch_table[index].fun(nal, private, arg_block, ret_block); +} + +char *dispatch_name(int index) +{ + return dispatch_table[index].name; +} diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c new file mode 100644 index 0000000..4c6c292 --- /dev/null +++ b/lustre/portals/portals/lib-eq.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-eq.c + * Library level Event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_size_t count_in + * void * base_in + * + * Outgoing: + * ptl_handle_eq_t * handle_out + */ + + PtlEQAlloc_in *args = v_args; + PtlEQAlloc_out *ret = v_ret; + + lib_eq_t *eq; + unsigned long flags; + + /* api should have rounded up */ + if (args->count_in != LOWEST_BIT_SET (args->count_in)) + return ret->rc = PTL_VAL_FAILED; + + eq = lib_eq_alloc (nal); + if (eq == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + if (nal->cb_map != NULL) { + struct iovec iov = { + .iov_base = args->base_in, + .iov_len = args->count_in * sizeof (ptl_event_t) }; + + ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); + if (ret->rc != PTL_OK) { + lib_eq_free (nal, eq); + + state_unlock (nal, &flags); + return (ret->rc); + } + } + + eq->sequence = 1; + eq->base = args->base_in; + eq->size = args->count_in; + eq->eq_refcount = 0; + eq->event_callback = args->callback_in; + + lib_initialise_handle (nal, &eq->eq_lh); + list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + + state_unlock(nal, &flags); + + ptl_eq2handle(&ret->handle_out, eq); + return (ret->rc = PTL_OK); +} + +int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_eq_t eventq_in + * + * Outgoing: + */ + + PtlEQFree_in *args = v_args; + PtlEQFree_out *ret = v_ret; + lib_eq_t *eq; + long flags; + + state_lock (nal, &flags); + + eq = ptl_handle2eq(&args->eventq_in, nal); + if (eq == NULL) { + ret->rc = PTL_INV_EQ; + } else if (eq->eq_refcount != 0) { + ret->rc = PTL_EQ_INUSE; + } else { + if (nal->cb_unmap != NULL) { + struct iovec iov = { + .iov_base = eq->base, + .iov_len = eq->size * sizeof (ptl_event_t) }; + + nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + } + + lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); 
+ lib_eq_free (nal, eq); + ret->rc = PTL_OK; + } + + state_unlock (nal, &flags); + + return (ret->rc); +} diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c new file mode 100644 index 0000000..40f3d2c --- /dev/null +++ b/lustre/portals/portals/lib-init.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-init.c + * Start up the internal library and clear all structures + * Called by the NAL when it initializes. Safe to call multiple times. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include + +#ifdef __KERNEL__ +# include /* for memset() */ +# include +# ifdef KERNEL_ADDR_CACHE +# include +# endif +#else +# include +# include +#endif + +#ifdef PTL_USE_SLAB_CACHE +static int ptl_slab_users; + +kmem_cache_t *ptl_md_slab; +kmem_cache_t *ptl_msg_slab; +kmem_cache_t *ptl_me_slab; +kmem_cache_t *ptl_eq_slab; + +atomic_t md_in_use_count; +atomic_t msg_in_use_count; +atomic_t me_in_use_count; +atomic_t eq_in_use_count; + +/* NB zeroing in ctor and on freeing ensures items that + * kmem_cache_validate() OK, but haven't been initialised + * as an MD/ME/EQ can't have valid handles + */ +static void +ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_md_t)); +} + +static void +ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_me_t)); +} + +static void +ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_eq_t)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + + /* We'll have 1 set of slabs for ALL the nals :) */ + + if (ptl_slab_users++) + return 0; + + ptl_md_slab = kmem_cache_create("portals_MD", + sizeof(lib_md_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_md_slab_ctor, NULL); + if (!ptl_md_slab) { + CERROR("couldn't allocate ptl_md_t slab"); + RETURN (PTL_NOSPACE); + } + + /* NB no ctor for msgs; they don't need handle verification */ + ptl_msg_slab = kmem_cache_create("portals_MSG", + sizeof(lib_msg_t), 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ptl_msg_slab) { + CERROR("couldn't allocate ptl_msg_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_me_slab = kmem_cache_create("portals_ME", + sizeof(lib_me_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_me_slab_ctor, NULL); + if (!ptl_me_slab) { + CERROR("couldn't allocate ptl_me_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_eq_slab = 
kmem_cache_create("portals_EQ", + sizeof(lib_eq_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_eq_slab_ctor, NULL); + if (!ptl_eq_slab) { + CERROR("couldn't allocate ptl_eq_t slab"); + RETURN (PTL_NOSPACE); + } + + RETURN(PTL_OK); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + if (--ptl_slab_users != 0) + return; + + LASSERT (atomic_read (&md_in_use_count) == 0); + LASSERT (atomic_read (&me_in_use_count) == 0); + LASSERT (atomic_read (&eq_in_use_count) == 0); + LASSERT (atomic_read (&msg_in_use_count) == 0); + + if (ptl_md_slab != NULL) + kmem_cache_destroy(ptl_md_slab); + if (ptl_msg_slab != NULL) + kmem_cache_destroy(ptl_msg_slab); + if (ptl_me_slab != NULL) + kmem_cache_destroy(ptl_me_slab); + if (ptl_eq_slab != NULL) + kmem_cache_destroy(ptl_eq_slab); +} +#else + +int +lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lib_freeobj_t, fo_contents); + + space = nal->cb_malloc (nal, n * size); + if (space == NULL) + return (PTL_NOSPACE); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (PTL_OK); +} + +void +lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); + + nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset (fl, 0, sizeof (fl)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + int rc; + + memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); + memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); + memset (&nal->ni.ni_free_mds, 0, sizeof 
(nal->ni.ni_free_mds)); + memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + MAX_MES, sizeof (lib_me_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + MAX_MSGS, sizeof (lib_msg_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + MAX_MDS, sizeof (lib_md_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + MAX_EQS, sizeof (lib_eq_t)); + return (rc); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + lib_freelist_fini (nal, &nal->ni.ni_free_mes); + lib_freelist_fini (nal, &nal->ni.ni_free_msgs); + lib_freelist_fini (nal, &nal->ni.ni_free_mds); + lib_freelist_fini (nal, &nal->ni.ni_free_eqs); +} + +#endif + +__u64 +lib_create_interface_cookie (nal_cb_t *nal) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid in a new instance of the same + * interface. Initialisation time, even if it's only implemented + * to millisecond resolution is probably easily good enough. 
*/ + struct timeval tv; + __u64 cookie; +#ifndef __KERNEL__ + int rc = gettimeofday (&tv, NULL); + LASSERT (rc == 0); +#else + do_gettimeofday(&tv); +#endif + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return (cookie); +} + +int +lib_setup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + int i; + + /* Arbitrary choice of hash table size */ +#ifdef __KERNEL__ + ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); +#else + ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; +#endif + ni->ni_lh_hash_table = + (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size + * sizeof (struct list_head)); + if (ni->ni_lh_hash_table == NULL) + return (PTL_NOSPACE); + + for (i = 0; i < ni->ni_lh_hash_size; i++) + INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); + + ni->ni_next_object_cookie = 0; + + return (PTL_OK); +} + +void +lib_cleanup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->ni_lh_hash_table == NULL) + return; + + nal->cb_free (nal, ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); +} + +lib_handle_t * +lib_lookup_cookie (nal_cb_t *nal, __u64 cookie) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + struct list_head *list; + struct list_head *el; + unsigned int hash; + + hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; + list = &ni->ni_lh_hash_table[hash]; + + list_for_each (el, list) { + lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); + + if (lh->lh_cookie == cookie) + return (lh); + } + + return (NULL); +} + +void +lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + unsigned int hash; + + lh->lh_cookie = ni->ni_next_object_cookie++; + hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size; + list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); +} + +void +lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + list_del (&lh->lh_hash_chain); 
+} + +int +lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size) +{ + int rc = PTL_OK; + lib_ni_t *ni = &nal->ni; + int i; + ENTRY; + + /* NB serialised in PtlNIInit() */ + + if (ni->refcnt != 0) { /* already initialised */ + ni->refcnt++; + goto out; + } + + /* + * Allocate the portal table for this interface + * and all per-interface objects. + */ + memset(&ni->counters, 0, sizeof(lib_counters_t)); + + rc = kportal_descriptor_setup (nal); + if (rc != PTL_OK) + goto out; + + INIT_LIST_HEAD (&ni->ni_active_msgs); + INIT_LIST_HEAD (&ni->ni_active_mds); + INIT_LIST_HEAD (&ni->ni_active_eqs); + + INIT_LIST_HEAD (&ni->ni_test_peers); + + ni->ni_interface_cookie = lib_create_interface_cookie (nal); + ni->ni_next_object_cookie = 0; + rc = lib_setup_handle_hash (nal); + if (rc != PTL_OK) + goto out; + + ni->nid = nid; + ni->pid = pid; + + ni->num_nodes = gsize; + ni->tbl.size = ptl_size; + + ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); + if (ni->tbl.tbl == NULL) { + rc = PTL_NOSPACE; + goto out; + } + + for (i = 0; i < ptl_size; i++) + INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + + ni->debug = PTL_DEBUG_NONE; + ni->up = 1; + ni->refcnt++; + + out: + if (rc != PTL_OK) { + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + } + + RETURN (rc); +} + +int +lib_fini(nal_cb_t * nal) +{ + lib_ni_t *ni = &nal->ni; + int idx; + + ni->refcnt--; + + if (ni->refcnt != 0) + goto out; + + /* NB no stat_lock() since this is the last reference. 
The NAL + * should have shut down already, so it should be safe to unlink + * and free all descriptors, even those that appear committed to a + * network op (eg MD with non-zero pending count) + */ + + for (idx = 0; idx < ni->tbl.size; idx++) + while (!list_empty (&ni->tbl.tbl[idx])) { + lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + lib_me_t, me_list); + + CERROR ("Active me %p on exit\n", me); + list_del (&me->me_list); + lib_me_free (nal, me); + } + + while (!list_empty (&ni->ni_active_mds)) { + lib_md_t *md = list_entry (ni->ni_active_mds.next, + lib_md_t, md_list); + + CERROR ("Active md %p on exit\n", md); + list_del (&md->md_list); + lib_md_free (nal, md); + } + + while (!list_empty (&ni->ni_active_eqs)) { + lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, + lib_eq_t, eq_list); + + CERROR ("Active eq %p on exit\n", eq); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + } + + while (!list_empty (&ni->ni_active_msgs)) { + lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, + lib_msg_t, msg_list); + + CERROR ("Active msg %p on exit\n", msg); + list_del (&msg->msg_list); + lib_msg_free (nal, msg); + } + + nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + ni->up = 0; + + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + + out: + return (PTL_OK); +} diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c new file mode 100644 index 0000000..d171050 --- /dev/null +++ b/lustre/portals/portals/lib-md.c @@ -0,0 +1,412 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-md.c + * Memory Descriptor management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +/* + * must be called with state lock held + */ +void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +{ + lib_me_t *me = md->me; + + if (md->pending != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + md->md_flags |= PTL_MD_FLAG_UNLINK; + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if ((md->options & PTL_MD_KIOV) != 0) { + if (nal->cb_unmap_pages != NULL) + nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->cb_unmap != NULL) + nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, + &md->md_addrkey); + + if (me) { + me->md = NULL; + if (me->unlink == PTL_UNLINK) + lib_me_unlink(nal, me); + } + + if (md->eq != NULL) + { + md->eq->eq_refcount--; + LASSERT (md->eq->eq_refcount >= 0); + } + + lib_invalidate_handle (nal, &md->md_lh); + list_del (&md->md_list); + lib_md_free(nal, md); +} + +/* must be called with state lock held */ +static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, + ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +{ + const int max_size_opts = PTL_MD_AUTO_UNLINK | + 
PTL_MD_MAX_SIZE; + lib_eq_t *eq = NULL; + int rc; + int i; + + /* NB we are passes an allocated, but uninitialised/active md. + * if we return success, caller may lib_md_unlink() it. + * otherwise caller may only lib_md_free() it. + */ + + if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) { + eq = ptl_handle2eq(eqh, nal); + if (eq == NULL) + return PTL_INV_EQ; + } + + if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ + md->niov > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_TOO_MANY; + + if ((md->options & max_size_opts) != 0 && /* max size used */ + (md->max_size < 0 || md->max_size > md->length)) // illegal max_size + return PTL_INV_MD; + + new->me = NULL; + new->start = md->start; + new->length = md->length; + new->offset = 0; + new->max_size = md->max_size; + new->unlink = unlink; + new->options = md->options; + new->user_ptr = md->user_ptr; + new->eq = eq; + new->threshold = md->threshold; + new->pending = 0; + new->md_flags = 0; + + if ((md->options & PTL_MD_IOV) != 0) { + int total_length = 0; + + if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.iov, md->start, + md->niov * sizeof (new->md_iov.iov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the base address on trust */ + if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_VAL_FAILED; + + total_length += new->md_iov.iov[i].iov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } else if ((md->options & PTL_MD_KIOV) != 0) { +#ifndef __KERNEL__ + return PTL_INV_MD; +#else + int total_length = 0; + + /* Trap attempt to use paged I/O if unsupported early. 
*/ + if (nal->cb_send_pages == NULL || + nal->cb_recv_pages == NULL) + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, + md->niov * sizeof (new->md_iov.kiov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the page pointer on trust */ + if (new->md_iov.kiov[i].kiov_offset + + new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + return PTL_VAL_FAILED; /* invalid length */ + + total_length += new->md_iov.kiov[i].kiov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map_pages != NULL) { + rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } +#endif + } else { /* contiguous */ + new->md_niov = 1; + new->md_iov.iov[0].iov_base = md->start; + new->md_iov.iov[0].iov_len = md->length; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } + + if (eq != NULL) + eq->eq_refcount++; + + /* It's good; let handle2md succeed and add to active mds */ + lib_initialise_handle (nal, &new->md_lh); + list_add (&new->md_list, &nal->ni.ni_active_mds); + + return PTL_OK; +} + +/* must be called with state lock held */ +void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + new->start = md->start; + new->length = md->length; + new->threshold = md->threshold; + new->max_size = md->max_size; + new->options = md->options; + new->user_ptr = md->user_ptr; + ptl_eq2handle(&new->eventq, md->eq); + new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 
0 : md->md_niov; +} + +int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_me_t current_in + * ptl_md_t md_in + * ptl_unlink_t unlink_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDAttach_in *args = v_args; + PtlMDAttach_out *ret = v_ret; + lib_me_t *me; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->me_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else if (me->md != NULL) { + ret->rc = PTL_INUSE; + } else { + ret->rc = lib_md_build(nal, md, private, &args->md_in, + &args->eq_in, args->unlink_in); + + if (ret->rc == PTL_OK) { + me->md = md; + md->me = me; + + ptl_md2handle(&ret->handle_out, md); + + state_unlock (nal, &flags); + return (PTL_OK); + } + } + + lib_md_free (nal, md); + + state_unlock (nal, &flags); + return (ret->rc); +} + +int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_md_t md_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDBind_in *args = v_args; + PtlMDBind_out *ret = v_ret; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + ret->rc = lib_md_build(nal, md, private, + &args->md_in, &args->eq_in, PTL_UNLINK); + + if (ret->rc == PTL_OK) { + ptl_md2handle(&ret->handle_out, md); + + state_unlock(nal, &flags); + return (PTL_OK); + } + + lib_md_free (nal, md); + + state_unlock(nal, &flags); + return (ret->rc); +} + +int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMDUnlink_in *args = v_args; + PtlMDUnlink_out *ret = v_ret; + + lib_md_t *md; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + } else if 
(md->pending != 0) { /* being filled/spilled */ + ret->rc = PTL_MD_INUSE; + } else { + /* Callers attempting to unlink a busy MD which will get + * unlinked once the net op completes should see INUSE, + * before completion and INV_MD thereafter. LASSERT we've + * got that right... */ + LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); + + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_md_t * old_inout + * ptl_md_t * new_inout + * ptl_handle_eq_t testq_in + * ptl_seq_t sequence_in + * + * Outgoing: + * ptl_md_t * old_inout + * ptl_md_t * new_inout + */ + PtlMDUpdate_internal_in *args = v_args; + PtlMDUpdate_internal_out *ret = v_ret; + lib_md_t *md; + lib_eq_t *test_eq = NULL; + ptl_md_t *new = &args->new_inout; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (args->old_inout_valid) + lib_md_deconstruct(nal, md, &ret->old_inout); + + if (!args->new_inout_valid) { + ret->rc = PTL_OK; + goto out; + } + + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(&args->testq_in, nal); + if (test_eq == NULL) { + ret->rc = PTL_INV_EQ; + goto out; + } + } + + if (md->pending != 0) { + ret->rc = PTL_NOUPDATE; + goto out; + } + + if (test_eq == NULL || + test_eq->sequence == args->sequence_in) { + lib_me_t *me = md->me; + +#warning this does not track eq refcounts properly + + ret->rc = lib_md_build(nal, md, private, + new, &new->eventq, md->unlink); + + md->me = me; + } else { + ret->rc = PTL_NOUPDATE; + } + + out: + state_unlock(nal, &flags); + return (ret->rc); +} diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c new file mode 100644 index 0000000..34fb606 --- 
/dev/null +++ b/lustre/portals/portals/lib-me.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-me.c + * Match Entry management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); + +int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEAttach_in *args = v_args; + PtlMEAttach_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_ptl_t *tbl = &ni->tbl; + unsigned long flags; + lib_me_t *me; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + /* Should check for valid matchid, but not yet */ + if (0) + return ret->rc = PTL_INV_PROC; + + me = lib_me_alloc (nal); + if (me == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me->match_id = args->match_id_in; + me->match_bits = args->match_bits_in; + me->ignore_bits = args->ignore_bits_in; + me->unlink = args->unlink_in; + me->md = NULL; + + lib_initialise_handle (nal, &me->me_lh); + + if (args->position_in == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + else + list_add(&me->me_list, &(tbl->tbl[args->index_in])); + + ptl_me2handle(&ret->handle_out, me); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEInsert_in *args = v_args; + PtlMEInsert_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + lib_me_t *new; + + new = lib_me_alloc (nal); + if (new == NULL) + return (ret->rc = PTL_NOSPACE); + + /* Should check for valid matchid, but not yet */ + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + lib_me_free (nal, new); + + state_unlock (nal, &flags); + return (ret->rc = PTL_INV_ME); + } + + new->match_id = args->match_id_in; + new->match_bits = args->match_bits_in; + new->ignore_bits = args->ignore_bits_in; + new->unlink = args->unlink_in; + new->md = NULL; + + lib_initialise_handle (nal, &new->me_lh); + + if (args->position_in == 
PTL_INS_AFTER) + list_add_tail(&new->me_list, &me->me_list); + else + list_add(&new->me_list, &me->me_list); + + ptl_me2handle(&ret->handle_out, new); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEUnlink_in *args = v_args; + PtlMEUnlink_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else { + lib_me_unlink(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +/* call with state_lock please */ +void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->debug & PTL_DEBUG_UNLINK) { + ptl_handle_any_t handle; + ptl_me2handle(&handle, me); + } + + list_del (&me->me_list); + + if (me->md) { + me->md->me = NULL; + lib_md_unlink(nal, me->md); + } + + lib_invalidate_handle (nal, &me->me_lh); + lib_me_free(nal, me); +} + +int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlTblDump_in *args = v_args; + PtlTblDump_out *ret = v_ret; + lib_ptl_t *tbl = &nal->ni.tbl; + ptl_handle_any_t handle; + struct list_head *tmp; + unsigned long flags; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + nal->cb_printf(nal, "Portal table index %d\n", args->index_in); + + state_lock(nal, &flags); + list_for_each(tmp, &(tbl->tbl[args->index_in])) { + lib_me_t *me = list_entry(tmp, lib_me_t, me_list); + ptl_me2handle(&handle, me); + lib_me_dump(nal, me); + } + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEDump_in *args = v_args; + PtlMEDump_out *ret = v_ret; + lib_me_t *me; + unsigned long flags; + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = 
PTL_INV_ME; + } else { + lib_me_dump(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return ret->rc; +} + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) +{ + nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); + + nal->cb_printf(nal, "\tMD\t= %p\n", me->md); + nal->cb_printf(nal, "\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + nal->cb_printf(nal, "\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); +} diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c new file mode 100644 index 0000000..7ba1664 --- /dev/null +++ b/lustre/portals/portals/lib-move.c @@ -0,0 +1,1287 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif +#include +#include +#include + +/* + * Right now it does not check access control lists. 
+ * + * We only support one MD per ME, which is how the Portals 3.1 spec is written. + * All previous complication is removed. + */ + +static lib_me_t * +lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, + ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, + ptl_match_bits_t match_bits, ptl_size_t *mlength_out, + ptl_size_t *offset_out, int *unlink_out) +{ + lib_ni_t *ni = &nal->ni; + struct list_head *match_list = &ni->tbl.tbl[index]; + struct list_head *tmp; + lib_me_t *me; + lib_md_t *md; + ptl_size_t mlength; + ptl_size_t offset; + + ENTRY; + + CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " + "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); + + if (index < 0 || index >= ni->tbl.size) { + CERROR("Invalid portal %d not in [0-%d]\n", + index, ni->tbl.size); + goto failed; + } + + list_for_each (tmp, match_list) { + me = list_entry(tmp, lib_me_t, me_list); + md = me->md; + + /* ME attached but MD not attached yet */ + if (md == NULL) + continue; + + LASSERT (me == md->me); + + /* MD deactivated */ + if (md->threshold == 0) + continue; + + /* mismatched MD op */ + if ((md->options & op_mask) == 0) + continue; + + /* mismatched ME nid/pid? */ + if (me->match_id.nid != PTL_NID_ANY && + me->match_id.nid != src_nid) + continue; + + if (me->match_id.pid != PTL_PID_ANY && + me->match_id.pid != src_pid) + continue; + + /* mismatched ME matchbits? */ + if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) + continue; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) + offset = md->offset; + else + offset = roffset; + + mlength = md->length - offset; + if ((md->options & PTL_MD_MAX_SIZE) != 0 && + mlength > md->max_size) + mlength = md->max_size; + + if (rlength <= mlength) { /* fits in allowed space */ + mlength = rlength; + } else if ((md->options & PTL_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet %d too big: %d left, " + "%d allowed\n", rlength, md->length - offset, + mlength); + goto failed; + } + + md->offset = offset + mlength; + + *offset_out = offset; + *mlength_out = mlength; + *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 && + md->offset >= (md->length - md->max_size)); + RETURN (me); + } + + failed: + CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 + " offset %d length %d: no match\n", + ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + src_nid, src_pid, index, match_bits, roffset, rlength); + RETURN(NULL); +} + +int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +{ + PtlFailNid_in *args = v_args; + PtlFailNid_out *ret = v_ret; + lib_test_peer_t *tp; + unsigned long flags; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + if (args->threshold != 0) { + /* Adding a new entry */ + tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + if (tp == NULL) + return (ret->rc = PTL_FAIL); + + tp->tp_nid = args->nid; + tp->tp_threshold = args->threshold; + + state_lock (nal, &flags); + list_add (&tp->tp_list, &nal->ni.ni_test_peers); + state_unlock (nal, &flags); + return (ret->rc = PTL_OK); + } + + /* removing entries */ + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + args->nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == args->nid) /* matched this 
one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + + list_del (&tp->tp_list); + nal->cb_free (nal, tp, sizeof (*tp)); + } + return (ret->rc = PTL_OK); +} + +static int +fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) +{ + lib_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + unsigned long flags; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != PTL_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + list_del (&tp->tp_list); + + nal->cb_free (nal, tp, sizeof (*tp)); + } + + return (fail); +} + +ptl_size_t +lib_iov_nob (int niov, struct iovec *iov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} + +void +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (dest, iov->iov_base, nob); + + len -= nob; + dest += nob; + niov--; + iov++; + } +} + +void +lib_copy_buf2iov (int niov, struct iovec *iov, char 
*src, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (iov->iov_base, src, nob); + + len -= nob; + src += nob; + niov--; + iov++; + } +} + +static int +lib_extract_iov (struct iovec *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + struct iovec *src = md->md_iov.iov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (dst_niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} + +#ifndef __KERNEL__ +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + LASSERT (0); + return (0); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + LASSERT (0); +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +{ + LASSERT (0); +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + LASSERT (0); +} + +#else + +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + ptl_size_t 
nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (dest, addr, nob); + kunmap (kiov->kiov_page); + + len -= nob; + dest += nob; + niov--; + kiov++; + } +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +{ + ptl_size_t nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (addr, src, nob); + kunmap (kiov->kiov_page); + + len -= nob; + src += nob; + niov--; + kiov++; + } +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + ptl_kiov_t *src = md->md_iov.kiov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return (dst_niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} +#endif + +void +lib_recv (nal_cb_t *nal, void *private, 
lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +{ + int niov; + + if (mlen == 0) + nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); + else if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); + nal->cb_recv (nal, private, msg, + niov, msg->msg_iov.iov, mlen, rlen); + } else { + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); + nal->cb_recv_pages (nal, private, msg, + niov, msg->msg_iov.kiov, mlen, rlen); + } +} + +int +lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len) +{ + int niov; + + if (len == 0) + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + 0, NULL, 0)); + + if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.iov, len)); + } + + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); + return (nal->cb_send_pages (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.kiov, len)); +} + +static lib_msg_t * +get_new_msg (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called holding the state_lock */ + lib_counters_t *counters = &nal->ni.counters; + lib_msg_t *msg = lib_msg_alloc (nal); + + if (msg == NULL) + return (NULL); + + memset (msg, 0, sizeof (*msg)); + + msg->send_ack = 0; + + msg->md = md; + msg->ev.arrival_time = get_cycles(); + md->pending++; + if (md->threshold != PTL_MD_THRESH_INF) { + LASSERT (md->threshold > 0); + md->threshold--; + } + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; + + list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + + return (msg); +} + + +/* + * Incoming messages have a ptl_msg_t object associated with them + * by the library. 
This object encapsulates the state of the + * message and allows the NAL to do non-blocking receives or sends + * of long messages. + * + */ +static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + unsigned long flags; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); + hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, + hdr->src_nid, hdr->src_pid, + PTL_HDR_LENGTH (hdr), hdr->msg.put.offset, + hdr->msg.put.match_bits, + &mlength, &offset, &unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " + "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + !(md->options & PTL_MD_ACK_DISABLE)) { + msg->send_ack = 1; + msg->ack_wmd = hdr->msg.put.ack_wmd; + msg->nid = hdr->src_nid; + msg->pid = hdr->src_pid; + msg->ev.match_bits = hdr->msg.put.match_bits; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_PUT; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.put.ptl_index; + msg->ev.match_bits = hdr->msg.put.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr->msg.put.hdr_data; + + /* NB if this match has exhausted the MD, we can't be sure + * that this 
event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. + */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += mlength; + + /* only unlink after MD's pending count has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + ptl_hdr_t reply; + unsigned long flags; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); + hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); + + /* compatibility check until field is deleted */ + if (hdr->msg.get.return_offset != 0) + CERROR("Unexpected non-zero get.return_offset %x from " + LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, + hdr->src_nid, hdr->src_pid, + hdr->msg.get.sink_length, hdr->msg.get.src_offset, + hdr->msg.get.match_bits, + &mlength, &offset, 
&unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " + "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + /* NB if this match has exhausted the MD, we can't be sure + * that this event will be the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. 
+ */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.send_count++; + ni->counters.send_length += mlength; + + /* only unlink after MD's refcount has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + memset (&reply, 0, sizeof (reply)); + reply.type = HTON__u32 (PTL_MSG_REPLY); + reply.dest_nid = HTON__u64 (hdr->src_nid); + reply.src_nid = HTON__u64 (ni->nid); + reply.dest_pid = HTON__u32 (hdr->src_pid); + reply.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength); + + reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + + rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, + hdr->src_nid, hdr->src_pid, md, offset, mlength); + if (rc != 0) { + CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", + ni->nid, hdr->src_nid); + state_lock (nal, &flags); + goto drop; + } + + /* Complete the incoming message */ + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (rc); + drop: + ni->counters.drop_count++; + ni->counters.drop_length += hdr->msg.get.sink_length; + state_unlock(nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + int rlength; + int length; + lib_msg_t *msg; + unsigned long flags; + + /* compatibility check until field is deleted */ + if (hdr->msg.reply.dst_offset != 0) + CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n", + hdr->msg.reply.dst_offset, hdr->src_nid); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD 
"LPX64"."LPX64"\n", + ni->nid, hdr->src_nid, + md == NULL ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + goto drop; + } + + LASSERT (md->offset == 0); + + length = rlength = PTL_HDR_LENGTH(hdr); + + if (length > md->length) { + if ((md->options & PTL_MD_TRUNCATE) == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64 + " length %d for MD "LPX64" would overflow (%d)\n", + ni->nid, hdr->src_nid, length, + hdr->msg.reply.dst_wmd.wh_object_cookie, + md->length); + goto drop; + } + length = md->length; + } + + CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", + hdr->src_nid, length, rlength, + hdr->msg.reply.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping REPLY from "LPU64": can't " + "allocate msg\n", ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += length; + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, 0, length, rlength); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + unsigned long flags; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = 
ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR(LPU64": Dropping ACK from "LPU64" to %s MD " + LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + goto drop; + } + + CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", + ni->nid, hdr->src_nid, + hdr->msg.ack.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + state_unlock(nal, &flags); + lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + case PTL_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} + +void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); + nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, + hdr->src_pid); + nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, + hdr->dest_pid); + + switch (hdr->type) { + default: + break; + + case PTL_MSG_PUT: + nal->cb_printf(nal, + " Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + 
hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + nal->cb_printf(nal, + " Length %d, offset %d, hdr data "LPX64"\n", + PTL_HDR_LENGTH(hdr), hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + nal->cb_printf(nal, + " Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + nal->cb_printf(nal, + " Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case PTL_MSG_ACK: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case PTL_MSG_REPLY: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + PTL_HDR_LENGTH(hdr)); + } + +} /* end of print_hdr() */ + + +int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + unsigned long flags; + + /* NB static check; optimizer will elide this if it's right */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.put.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.get.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.reply.length)); + + /* convert common fields to host byte order */ + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->src_nid = NTOH__u64 (hdr->src_nid); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); + hdr->src_pid = NTOH__u32 (hdr->src_pid); + hdr->type = NTOH__u32 (hdr->type); + PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr)); +#if 0 + nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", + nal->ni.nid, nal, hdr, hdr->type); + 
print_hdr(nal, hdr); +#endif + if (hdr->type == PTL_MSG_HELLO) { + /* dest_nid is really ptl_magicversion_t */ + ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; + + CERROR (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->ni.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (hdr->dest_nid != nal->ni.nid) { + CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 + " (not me)\n", nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + + state_lock (nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ + { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": simulated failure\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + return (-1); + } + + switch (hdr->type) { + case PTL_MSG_ACK: + return (parse_ack(nal, hdr, private)); + case PTL_MSG_PUT: + return (parse_put(nal, hdr, private)); + break; + case PTL_MSG_GET: + return (parse_get(nal, hdr, private)); + break; + case PTL_MSG_REPLY: + return (parse_reply(nal, hdr, private)); + break; + default: + CERROR(LPU64": Dropping message from "LPU64 + ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, + hdr->type); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } +} + + +int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_ack_req_t ack_req_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + 
PtlPut_in *args = v_args; + PtlPut_out *ret = v_ret; + ptl_hdr_t hdr; + + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + ptl_process_id_t *id = &args->target_in; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? */ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + ret->rc = PTL_OK; + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_PUT); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length); + + /* NB handles only looked up by creator (no flips) */ + if (args->ack_req_in == PTL_ACK_REQ) { + hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; + } else { + hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; + } + + hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.put.offset = HTON__u32 (args->offset_in); + hdr.msg.put.hdr_data = args->hdr_data_in; + + ni->counters.send_count++; + ni->counters.send_length += md->length; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("BAD: could not allocate msg!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we need to allocate a message state object and record the + * information about this operation that will be recorded into + * event queue once the message 
has been completed. + * + * NB. We're now committed to the PUT, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. + */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + + return ret->rc = PTL_OK; +} + + +int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + PtlGet_in *args = v_args; + PtlGet_out *ret = v_ret; + ptl_hdr_t hdr; + lib_msg_t *msg = NULL; + lib_ni_t *ni = &nal->ni; + ptl_process_id_t *id = &args->target_in; + lib_md_t *md; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + LASSERT (md->offset == 0); + + CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_GET); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = 0; + + /* NB handles only looked up by creator (no flips) */ + hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; + + hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.sink_length = HTON__u32 (md->length); + + ni->counters.send_count++; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we must allocate a message state object that will record + * the information to be filled in once the message has been + * completed. More information is in the do_PtlPut() comments. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. 
+ */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + + return ret->rc = PTL_OK; +} diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c new file mode 100644 index 0000000..20a6c66 --- /dev/null +++ b/lustre/portals/portals/lib-msg.c @@ -0,0 +1,163 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-msg.c + * Message decoding, parsing and finalizing routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include + +int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +{ + lib_md_t *md; + lib_eq_t *eq; + int rc; + unsigned long flags; + + /* ni went down while processing this message */ + if (nal->ni.up == 0) { + return -1; + } + + if (msg == NULL) + return 0; + + rc = 0; + if (msg->send_ack) { + ptl_hdr_t ack; + + LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + + memset (&ack, 0, sizeof (ack)); + ack.type = HTON__u32 (PTL_MSG_ACK); + ack.dest_nid = HTON__u64 (msg->nid); + ack.src_nid = HTON__u64 (nal->ni.nid); + ack.dest_pid = HTON__u32 (msg->pid); + ack.src_pid = HTON__u32 (nal->ni.pid); + PTL_HDR_LENGTH(&ack) = 0; + + ack.msg.ack.dst_wmd = msg->ack_wmd; + ack.msg.ack.match_bits = msg->ev.match_bits; + ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); + + rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, + msg->nid, msg->pid, NULL, 0, 0); + } + + md = msg->md; + LASSERT (md->pending > 0); /* I've not dropped my ref yet */ + eq = md->eq; + + state_lock(nal, &flags); + + if (eq != NULL) { + ptl_event_t *ev = &msg->ev; + ptl_event_t *eq_slot; + + /* I have to hold the lock while I bump the sequence number + * and copy the event into the queue. If not, and I was + * interrupted after bumping the sequence number, other + * events could fill the queue, including the slot I just + * allocated to this event. On resuming, I would overwrite + * a more 'recent' event with old event state, and + * processes taking events off the queue would not detect + * overflow correctly. + */ + + ev->sequence = eq->sequence++;/* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Invalidate unlinked_me unless this is the last + * event for an auto-unlinked MD. 
Note that if md was + * auto-unlinked, md->pending can only decrease + */ + if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ + md->pending != 1) /* not last ref */ + ev->unlinked_me = PTL_HANDLE_NONE; + + /* Copy the event into the allocated slot, ensuring all the + * rest of the event's contents have been copied _before_ + * the sequence number gets updated. A process 'getting' + * an event waits on the next queue slot's sequence to be + * 'new'. When it is, _all_ other event fields had better + * be consistent. I assert 'sequence' is the last member, + * so I only need a 2 stage copy. + */ + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' */ + + /* cb_write is not necessarily atomic, so this could + cause a race with PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + + /* I must also ensure that (a) callbacks are made in the + * same order as the events land in the queue, and (b) the + * callback occurs before the event can be removed from the + * queue, so I can't drop the lock during the callback. 
*/ + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + (void)((eq->event_callback) (ev)); + } + + LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + + md->pending--; + if (md->pending == 0 && /* no more outstanding operations on this md */ + (md->threshold == 0 || /* done its business */ + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + lib_md_unlink(nal, md); + + list_del (&msg->msg_list); + nal->ni.counters.msgs_alloc--; + lib_msg_free(nal, msg); + + state_unlock(nal, &flags); + + return rc; +} diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c new file mode 100644 index 0000000..37dcb91 --- /dev/null +++ b/lustre/portals/portals/lib-ni.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-ni.c + * Network status registers and distance functions. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +#define MAX_DIST 18446744073709551615UL + +int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlNIDebug_in *args = v_args; + PtlNIDebug_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->rc = ni->debug; + ni->debug = args->mask_in; + + return 0; +} + +int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_sr_index_t register_in + * + * Outgoing: + * ptl_sr_value_t * status_out + */ + + PtlNIStatus_in *args = v_args; + PtlNIStatus_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_counters_t *count = &ni->counters; + + if (!args) + return ret->rc = PTL_SEGV; + + ret->rc = PTL_OK; + ret->status_out = 0; + + /* + * I hate this sort of code.... Hash tables, offset lists? + * Treat the counters as an array of ints? + */ + if (args->register_in == PTL_SR_DROP_COUNT) + ret->status_out = count->drop_count; + + else if (args->register_in == PTL_SR_DROP_LENGTH) + ret->status_out = count->drop_length; + + else if (args->register_in == PTL_SR_RECV_COUNT) + ret->status_out = count->recv_count; + + else if (args->register_in == PTL_SR_RECV_LENGTH) + ret->status_out = count->recv_length; + + else if (args->register_in == PTL_SR_SEND_COUNT) + ret->status_out = count->send_count; + + else if (args->register_in == PTL_SR_SEND_LENGTH) + ret->status_out = count->send_length; + + else if (args->register_in == PTL_SR_MSGS_MAX) + ret->status_out = count->msgs_max; + else + ret->rc = PTL_INV_SR_INDX; + + return ret->rc; +} + + +int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_process_id_t process_in + + * + * Outgoing: + * unsigned long * distance_out + + */ + + PtlNIDist_in *args = v_args; + PtlNIDist_out *ret = v_ret; + + unsigned long dist; + ptl_process_id_t id_in = args->process_in; + ptl_nid_t nid; + int rc; + + 
nid = id_in.nid; + + if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { + ret->distance_out = (unsigned long) MAX_DIST; + return PTL_INV_PROC; + } + + ret->distance_out = dist; + + return ret->rc = PTL_OK; +} diff --git a/lustre/portals/portals/lib-not-impl.c b/lustre/portals/portals/lib-not-impl.c new file mode 100644 index 0000000..78959b2 --- /dev/null +++ b/lustre/portals/portals/lib-not-impl.c @@ -0,0 +1,37 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-not-impl.c + * + * boiler plate functions that can be used to write the + * library side routines + */ + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + + +int do_PtlACEntry(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_ac_index_t index_in + * ptl_process_id_t match_id_in + * ptl_pt_index_t portal_in + + * + * Outgoing: + + */ + + PtlACEntry_in *args = v_args; + PtlACEntry_out *ret = v_ret; + + if (!args) + return ret->rc = PTL_SEGV; + + return ret->rc = PTL_NOT_IMPLEMENTED; +} diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c new file mode 100644 index 0000000..e00e9f0 --- /dev/null +++ b/lustre/portals/portals/lib-pid.c @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-pid.c + * Process identification routines + */ + +/* This should be removed. 
The NAL should have the PID information */ +#define DEBUG_SUBSYSTEM S_PORTALS + +#if defined (__KERNEL__) +# include +extern int getpid(void); +#else +# include +# include +#endif +#include +#include + +int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t handle_in + * + * Outgoing: + * ptl_process_id_t * id_out + * ptl_id_t * gsize_out + */ + + PtlGetId_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->id_out.nid = ni->nid; + ret->id_out.pid = ni->pid; + + return ret->rc = PTL_OK; +} diff --git a/lustre/portals/router/Makefile.am b/lustre/portals/router/Makefile.am new file mode 100644 index 0000000..1c8087b --- /dev/null +++ b/lustre/portals/router/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +MODULE = kptlrouter +modulenet_DATA = kptlrouter.o +EXTRA_PROGRAMS = kptlrouter + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kptlrouter_SOURCES = router.c proc.c router.h diff --git a/lustre/portals/router/Makefile.mk b/lustre/portals/router/Makefile.mk new file mode 100644 index 0000000..64bd09b --- /dev/null +++ b/lustre/portals/router/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += kptlrouter.o +kptlrouter-objs := router.o proc.o diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c new file mode 100644 index 0000000..dd65b34 --- /dev/null +++ b/lustre/portals/router/proc.c @@ -0,0 +1,78 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "router.h" + +#define KPR_PROC_ROUTER "sys/portals/router" + +int +kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data) +{ + unsigned long long bytes = kpr_fwd_bytes; + unsigned long packets = kpr_fwd_packets; + unsigned long errors = kpr_fwd_errors; + unsigned int qdepth = atomic_read (&kpr_queue_depth); + int len; + + *eof = 1; + if (off != 0) + return (0); + + len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); + + *start = page; + return (len); +} + +int +kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data) +{ + /* Ignore what we've been asked to write, and just zero the stats counters */ + kpr_fwd_bytes = 0; + kpr_fwd_packets = 0; + kpr_fwd_errors = 0; + + return (count); +} + +void +kpr_proc_init(void) +{ + struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL); + + if (entry == NULL) + { + CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); + return; + } + + entry->data = NULL; + entry->read_proc = kpr_proc_read; + entry->write_proc = kpr_proc_write; +} + +void +kpr_proc_fini(void) +{ + remove_proc_entry(KPR_PROC_ROUTER, 0); +} diff --git a/lustre/portals/router/router.c 
b/lustre/portals/router/router.c new file mode 100644 index 0000000..8a1de08 --- /dev/null +++ b/lustre/portals/router/router.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "router.h" + +struct list_head kpr_routes; +struct list_head kpr_nals; + +unsigned long long kpr_fwd_bytes; +unsigned long kpr_fwd_packets; +unsigned long kpr_fwd_errors; +atomic_t kpr_queue_depth; + +/* Mostly the tables are read-only (thread and interrupt context) + * + * Once in a blue moon we register/deregister NALs and add/remove routing + * entries (thread context only)... 
*/ +rwlock_t kpr_rwlock; + +kpr_router_interface_t kpr_router_interface = { + kprri_register: kpr_register_nal, + kprri_lookup: kpr_lookup_target, + kprri_fwd_start: kpr_forward_packet, + kprri_fwd_done: kpr_complete_packet, + kprri_shutdown: kpr_shutdown_nal, + kprri_deregister: kpr_deregister_nal, +}; + +kpr_control_interface_t kpr_control_interface = { + kprci_add_route: kpr_add_route, + kprci_del_route: kpr_del_route, + kprci_get_route: kpr_get_route, +}; + +int +kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) +{ + long flags; + struct list_head *e; + kpr_nal_entry_t *ne; + + CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + + PORTAL_ALLOC (ne, sizeof (*ne)); + if (ne == NULL) + return (-ENOMEM); + + memset (ne, 0, sizeof (*ne)); + memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); + + LASSERT (!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) + { + write_unlock_irqrestore (&kpr_rwlock, flags); + + CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid); + + PORTAL_FREE (ne, sizeof (*ne)); + return (-EEXIST); + } + } + + list_add (&ne->kpne_list, &kpr_nals); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + *argp = ne; + PORTAL_MODULE_USE; + return (0); +} + +void +kpr_shutdown_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (!ne->kpne_shutdown); + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... 
*/ + ne->kpne_shutdown = 1; + write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */ + + while (atomic_read (&ne->kpne_refcount) != 0) + { + CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", + ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); + + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } +} + +void +kpr_deregister_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ + LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); + + list_del (&ne->kpne_list); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (ne, sizeof (*ne)); + PORTAL_MODULE_UNUSE; +} + + +int +kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + int rc = -ENOENT; + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + + if (ne->kpne_shutdown) /* caller is shutting down */ + return (-ENOENT); + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a gateway to target_nid on the callers network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || + re->kpre_hi_nid < target_nid) + continue; + + /* found table entry */ + + if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ + rc = -EHOSTUNREACH; + else + { + rc = 0; + *gateway_nidp = re->kpre_gateway_nid; + } + break; + } + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + target_nid, ne->kpne_interface.kprni_nalid, rc, + (rc == 
0) ? *gateway_nidp : (ptl_nid_t)0); + return (rc); +} + +void +kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + struct list_head *e; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ + LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + + atomic_inc (&kpr_queue_depth); + + kpr_fwd_packets++; /* (loose) stats accounting */ + kpr_fwd_bytes += nob; + + if (src_ne->kpne_shutdown) /* caller is shutting down */ + goto out; + + fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ + atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || /* no match */ + re->kpre_hi_nid < target_nid) + continue; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, + re->kpre_gateway_nid, re->kpre_gateway_nalid); + + if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) + break; /* don't route to same NAL */ + + /* Search for gateway's NAL's entry */ + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ + break; + + fwd->kprfd_gateway_nid = re->kpre_gateway_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd 
completes */ + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; + } + break; + } + + read_unlock (&kpr_rwlock); + out: + kpr_fwd_errors++; + + CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + /* Can't find anywhere to forward to */ + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); +} + +void +kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) +{ + kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; + + CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */ + + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); + + CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! 
*/ +} + +int +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + long flags; + struct list_head *e; + kpr_route_entry_t *re; + + CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + gateway_nalid, gateway_nid, lo_nid, hi_nid); + + LASSERT(lo_nid <= hi_nid); + + PORTAL_ALLOC (re, sizeof (*re)); + if (re == NULL) + return (-ENOMEM); + + re->kpre_gateway_nalid = gateway_nalid; + re->kpre_gateway_nid = gateway_nid; + re->kpre_lo_nid = lo_nid; + re->kpre_hi_nid = hi_nid; + + LASSERT(!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > re2->kpre_hi_nid || + re->kpre_hi_nid < re2->kpre_lo_nid) + continue; + + CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" + "to ["LPX64" - "LPX64"]\n", + re->kpre_lo_nid, re->kpre_hi_nid, + re2->kpre_lo_nid, re2->kpre_hi_nid); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (re, sizeof (*re)); + return (-EINVAL); + } + + list_add (&re->kpre_list, &kpr_routes); + + write_unlock_irqrestore (&kpr_rwlock, flags); + return (0); +} + +int +kpr_del_route (ptl_nid_t nid) +{ + long flags; + struct list_head *e; + + CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + + LASSERT(!in_interrupt()); + write_lock_irqsave(&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + continue; + + list_del (&re->kpre_list); + write_unlock_irqrestore(&kpr_rwlock, flags); + + PORTAL_FREE(re, sizeof (*re)); + return (0); + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + return (-ENOENT); +} + +int +kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +{ + struct list_head *e; + + 
read_lock(&kpr_rwlock); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (idx-- == 0) { + *gateway_nalid = re->kpre_gateway_nalid; + *gateway_nid = re->kpre_gateway_nid; + *lo_nid = re->kpre_lo_nid; + *hi_nid = re->kpre_hi_nid; + + read_unlock(&kpr_rwlock); + return (0); + } + } + + read_unlock (&kpr_rwlock); + return (-ENOENT); +} + +static void __exit +kpr_finalise (void) +{ + LASSERT (list_empty (&kpr_nals)); + + while (!list_empty (&kpr_routes)) { + kpr_route_entry_t *re = list_entry(kpr_routes.next, + kpr_route_entry_t, + kpre_list); + + list_del(&re->kpre_list); + PORTAL_FREE(re, sizeof (*re)); + } + + kpr_proc_fini(); + + PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); + PORTAL_SYMBOL_UNREGISTER(kpr_control_interface); + + CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", + atomic_read(&portal_kmemory)); +} + +static int __init +kpr_initialise (void) +{ + CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", + atomic_read(&portal_kmemory)); + + rwlock_init(&kpr_rwlock); + INIT_LIST_HEAD(&kpr_routes); + INIT_LIST_HEAD(&kpr_nals); + + kpr_proc_init(); + + PORTAL_SYMBOL_REGISTER(kpr_router_interface); + PORTAL_SYMBOL_REGISTER(kpr_control_interface); + return (0); +} + +MODULE_AUTHOR("Eric Barton"); +MODULE_DESCRIPTION("Kernel Portals Router v0.01"); +MODULE_LICENSE("GPL"); + +module_init (kpr_initialise); +module_exit (kpr_finalise); + +EXPORT_SYMBOL (kpr_control_interface); +EXPORT_SYMBOL (kpr_router_interface); diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h new file mode 100644 index 0000000..b8c3bec --- /dev/null +++ b/lustre/portals/router/router.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _KPTLROUTER_H +#define _KPTLROUTER_H +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLROUTER + +#include +#include +#include + +typedef struct +{ + struct list_head kpne_list; + kpr_nal_interface_t kpne_interface; + atomic_t kpne_refcount; + int kpne_shutdown; +} kpr_nal_entry_t; + +typedef struct +{ + struct list_head kpre_list; + int kpre_gateway_nalid; + ptl_nid_t kpre_gateway_nid; + ptl_nid_t kpre_lo_nid; + ptl_nid_t kpre_hi_nid; +} kpr_route_entry_t; + +extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_shutdown_nal (void *arg); +extern void kpr_deregister_nal (void *arg); + +extern void kpr_proc_init (void); +extern void kpr_proc_fini (void); + +extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); +extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, + ptl_nid_t 
*lo_nid, ptl_nid_t *hi_nid); + +extern unsigned long long kpr_fwd_bytes; +extern unsigned long kpr_fwd_packets; +extern unsigned long kpr_fwd_errors; +extern atomic_t kpr_queue_depth; + +#endif /* _KPTLROUTER_H */ diff --git a/lustre/portals/tests/.cvsignore b/lustre/portals/tests/.cvsignore new file mode 100644 index 0000000..051d1bd --- /dev/null +++ b/lustre/portals/tests/.cvsignore @@ -0,0 +1,3 @@ +Makefile +Makefile.in +.deps diff --git a/lustre/portals/tests/Makefile.am b/lustre/portals/tests/Makefile.am new file mode 100644 index 0000000..7b47ae0 --- /dev/null +++ b/lustre/portals/tests/Makefile.am @@ -0,0 +1,23 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r +LINK = $(LD) $(LDFLAGS) -o $@ +DEFS = +LIBS = +MODULE = $(basename) +EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh + +noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o + +pingsrv_o_SOURCES = ping_srv.c ping.h + +pingcli_o_SOURCES = ping_cli.c ping.h + +spingsrv_o_SOURCES = sping_srv.c ping.h + +spingcli_o_SOURCES = sping_cli.c ping.h diff --git a/lustre/portals/tests/ping.h b/lustre/portals/tests/ping.h new file mode 100644 index 0000000..f07444b --- /dev/null +++ b/lustre/portals/tests/ping.h @@ -0,0 +1,80 @@ +#ifndef _KPING_INCLUDED +#define _KPING_INCLUDED + +#include + + +#define PTL_PING_IN_SIZE 256 // n packets per buffer +#define PTL_PING_IN_BUFFERS 2 // n fallback buffers + +#define PTL_PING_CLIENT 4 +#define PTL_PING_SERVER 5 + +#define PING_HEADER_MAGIC 0xDEADBEEF +#define PING_BULK_MAGIC 0xCAFEBABE + +#define PING_HEAD_BITS 0x00000001 +#define PING_BULK_BITS 0x00000002 +#define PING_IGNORE_BITS 0xFFFFFFFC + +#define PTL_PING_ACK 0x01 +#define PTL_PING_VERBOSE 0x02 +#define PTL_PING_VERIFY 0x04 +#define PTL_PING_PREALLOC 0x08 + + +#define 
NEXT_PRIMARY_BUFFER(index) \ + (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1)) + +#define PDEBUG(str, err) \ + CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) + + +/* Ping data to be passed via the ioctl to kernel space */ + +#if __KERNEL__ + + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +struct pingsrv_data { + + ptl_handle_ni_t ni; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + void *in_buf; + ptl_process_id_t my_id; + ptl_process_id_t id_local; + ptl_md_t mdin; + ptl_md_t mdout; + ptl_handle_md_t mdin_h; + ptl_handle_md_t mdout_h; + ptl_event_t evnt; + struct task_struct *tsk; +}; /* struct pingsrv_data */ + +struct pingcli_data { + + struct portal_ioctl_data *args; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + char *inbuf; + char *outbuf; + ptl_process_id_t myid; + ptl_process_id_t id_local; + ptl_process_id_t id_remote; + ptl_md_t md_in_head; + ptl_md_t md_out_head; + ptl_handle_md_t md_in_head_h; + ptl_handle_md_t md_out_head_h; + ptl_event_t ev; + struct task_struct *tsk; +}; /* struct pingcli_data */ + + +#endif /* __KERNEL__ */ + +#endif /* _KPING_INCLUDED */ diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c new file mode 100644 index 0000000..389ffbb --- /dev/null +++ b/lustre/portals/tests/ping_cli.c @@ -0,0 +1,300 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + +

/* Fixed part of a ping packet: magic, sequence number, send timestamp */
#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))

#define MAX_TIME 100000

/* This should be enclosed in a structure */

static struct pingcli_data *client = NULL;

/* Sequence number of the ping currently awaiting a reply */
static int count = 0;

/*
 * Release the ping client's resources.
 *
 * 'err' selects how far setup got; each case deliberately falls through
 * into the next so that everything acquired up to that stage is released:
 *   1 - outgoing MD bound (or being bound)
 *   2 - ME / EQ / incoming MD stage
 *   3 - network interface acquired
 *   4 - only memory allocated
 * The packet buffers are freed last, after the MDs that point at them
 * have been unlinked, and 'client' is reset to NULL once freed.
 */
static void
pingcli_shutdown(int err)
{
        int rc;

        switch (err) {
                case 1:
                        /* Unlink any memory descriptors we may have used */
                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
                                PDEBUG ("PtlMDUnlink", rc);
                        /* fallthrough */
                case 2:
                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
                                PDEBUG ("PtlMDUnlink", rc);

                        /* Free the event queue */
                        if ((rc = PtlEQFree (client->eq)))
                                PDEBUG ("PtlEQFree", rc);

                        if ((rc = PtlMEUnlink (client->me)))
                                PDEBUG ("PtlMEUnlink", rc);
                        /* fallthrough */
                case 3:
                        kportal_put_ni (client->args->ioc_nal);
                        /* fallthrough */
                case 4:
                        /* Free our buffers */
                        if (client == NULL)
                                break;

                        /* sizes mirror the allocations in pingcli_start() */
                        if (client->outbuf != NULL) {
                                PORTAL_FREE (client->outbuf,
                                             STDSIZE + client->args->ioc_size);
                        }
                        if (client->inbuf != NULL) {
                                PORTAL_FREE (client->inbuf,
                                             (client->args->ioc_size + STDSIZE) *
                                             client->args->ioc_count);
                        }

                        PORTAL_FREE (client, sizeof(struct pingcli_data));
                        client = NULL;  /* don't leave a dangling pointer */
                        break;
                default:
                        break;
        }

        CDEBUG (D_OTHER, "ping client released resources\n");
} /* pingcli_shutdown() */

/*
 * Event queue callback for the incoming buffer.  Checks the reply magic
 * and wakes the pinging task if the reply's sequence number is the one
 * currently awaited (or if count is 0).  Always returns 1.
 */
static int pingcli_callback(ptl_event_t *ev)
{
        unsigned magic = *(unsigned *)(ev->mem_desc.start + ev->offset);
        int      seq   = *(int *)(ev->mem_desc.start + ev->offset +
                                  sizeof(unsigned));

        if (magic != PING_BULK_MAGIC) {
                printk ("Unexpected response \n");
                return 1;
        }

        if ((seq == count) || !count)
                wake_up_process (client->tsk);
        else
                printk ("Received response after timeout for %d\n", seq);
        return 1;
}

/*
 * Run one ping session as described by the ioctl arguments: set up the
 * match entry, event queue and buffers, send args->ioc_count pings to
 * args->ioc_nid, printing the round-trip time (or a timeout) for each,
 * then release everything via pingcli_shutdown().
 *
 * Expects 'client' to have been allocated and zeroed by the caller.
 * Always returns NULL; 'client' has been released on return.
 *
 * NOTE(review): (ioc_size + STDSIZE) * ioc_count comes straight from the
 * ioctl and is not range-checked here.
 */
static struct pingcli_data *
pingcli_start(struct portal_ioctl_data *args)
{
        ptl_handle_ni_t *nip;
        unsigned         ping_head_magic = PING_HEADER_MAGIC;
        int              rc;
        struct timeval   tv1, tv2;

        client->tsk = current;
        client->args = args;    /* pingcli_shutdown() needs this from here on */

        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", "
                "nal %d, size %u, count: %u, timeout: %u\n",
                args->ioc_nid, args->ioc_nal, args->ioc_size,
                args->ioc_count, args->ioc_timeout);

        PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size);
        if (client->outbuf == NULL)
        {
                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n",
                        STDSIZE + args->ioc_size);
                pingcli_shutdown (4);
                return (NULL);
        }

        PORTAL_ALLOC (client->inbuf,
                      (args->ioc_size + STDSIZE) * args->ioc_count);
        if (client->inbuf == NULL)
        {
                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n",
                        (args->ioc_size + STDSIZE) * args->ioc_count);
                pingcli_shutdown (4);
                return (NULL);
        }

        /* Acquire and initialize the proper nal for portals. */
        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
        {
                CERROR ("NAL %d not loaded\n", args->ioc_nal);
                pingcli_shutdown (4);
                return (NULL);
        }

        /* Based on the initialization acquire our unique portal ID. */
        if ((rc = PtlGetId (*nip, &client->myid)))
        {
                CERROR ("PtlGetId error %d\n", rc);
                pingcli_shutdown (3);   /* only the NI is held so far */
                return (NULL);
        }

        /* Setup the local match entries */
        client->id_local.nid = PTL_NID_ANY;
        client->id_local.pid = PTL_PID_ANY;

        /* Setup the remote match entries */
        client->id_remote.nid = args->ioc_nid;
        client->id_remote.pid = 0;

        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
                               client->id_local, 0, ~0, PTL_RETAIN,
                               PTL_INS_AFTER, &client->me)))
        {
                CERROR ("PtlMEAttach error %d\n", rc);
                pingcli_shutdown (3);   /* no ME/EQ/MD exists yet */
                return (NULL);
        }

        /* Allocate the event queue for this network interface */
        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
        {
                CERROR ("PtlEQAlloc error %d\n", rc);
                pingcli_shutdown (2);
                return (NULL);
        }

        count = args->ioc_count;

        client->md_in_head.start     = client->inbuf;
        client->md_in_head.length    = (args->ioc_size + STDSIZE) * count;
        client->md_in_head.threshold = PTL_MD_THRESH_INF;
        client->md_in_head.options   = PTL_MD_OP_PUT;
        client->md_in_head.user_ptr  = NULL;
        client->md_in_head.eventq    = client->eq;
        memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count);

        /* Attach the incoming buffer */
        if ((rc = PtlMDAttach (client->me, client->md_in_head,
                               PTL_UNLINK, &client->md_in_head_h))) {
                CERROR ("PtlMDAttach error %d\n", rc);
                pingcli_shutdown (1);
                return (NULL);
        }

        /* Setup the outgoing ping header */
        client->md_out_head.start     = client->outbuf;
        client->md_out_head.length    = STDSIZE + args->ioc_size;
        client->md_out_head.threshold = args->ioc_count;
        client->md_out_head.options   = PTL_MD_OP_PUT;
        client->md_out_head.user_ptr  = NULL;
        client->md_out_head.eventq    = PTL_EQ_NONE;

        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));

        count = 0;

        /* Bind the outgoing ping header */
        if ((rc = PtlMDBind (*nip, client->md_out_head,
                             &client->md_out_head_h))) {
                CERROR ("PtlMDBind error %d\n", rc);
                pingcli_shutdown (1);
                return NULL;
        }

        while ((args->ioc_count - count)) {
                /* packet layout: magic | sequence number | send time */
                memcpy (client->outbuf + sizeof(unsigned),
                        &(count), sizeof(unsigned));

                /* Put the ping packet */
                do_gettimeofday (&tv1);

                memcpy (client->outbuf + sizeof(unsigned) + sizeof(unsigned),
                        &tv1, sizeof(struct timeval));

                if ((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
                                  client->id_remote, PTL_PING_SERVER,
                                  0, 0, 0, 0))) {
                        PDEBUG ("PtlPut (header)", rc);
                        pingcli_shutdown (1);
                        return NULL;
                }
                printk ("sent msg no %d", count);

                /* sleep until pingcli_callback() wakes us or we time out */
                set_current_state (TASK_INTERRUPTIBLE);
                rc = schedule_timeout (20 * args->ioc_timeout);
                if (rc == 0) {
                        printk (" :: timeout .....\n");
                } else {
                        do_gettimeofday (&tv2);
                        printk(" :: Reply in %u usec\n",
                               (unsigned)((tv2.tv_sec - tv1.tv_sec)
                                          * 1000000 + (tv2.tv_usec - tv1.tv_usec)));
                }
                count++;
        }

        /* Buffers are released inside pingcli_shutdown(), after the MDs
         * referencing them have been unlinked. */
        pingcli_shutdown (2);

        /* Success! */
        return NULL;
} /* pingcli_start() */



/*
 * Called by the portals_ioctl for ping requests.
 * NOTE(review): returns 0 even when the client structure cannot be
 * allocated; kept as is because callers are not visible here.
 */
static int kping_client(struct portal_ioctl_data *args)
{
        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
        if (client == NULL)
        {
                CERROR ("Unable to allocate client structure\n");
                return (0);
        }
        memset (client, 0, sizeof(struct pingcli_data));
        pingcli_start (args);

        return 0;
} /* kping_client() */


static int __init pingcli_init(void)
{
        PORTAL_SYMBOL_REGISTER(kping_client);
        return 0;
} /* pingcli_init() */


static void __exit pingcli_cleanup(void)
{
        PORTAL_SYMBOL_UNREGISTER (kping_client);
} /* pingcli_cleanup() */


MODULE_AUTHOR("Brian Behlendorf (LLNL)");
MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
MODULE_LICENSE("GPL");

module_init(pingcli_init);
module_exit(pingcli_cleanup);

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
EXPORT_SYMBOL (kping_client);
#endif
diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c new file mode 100644 index 0000000..1037d09 --- /dev/null +++ b/lustre/portals/tests/ping_srv.c @@ -0,0 +1,308 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Amey Inamdar + * Kedar Sovani + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include +

#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
#define MAXSIZE (16*1024*1024)

static unsigned ping_head_magic;        /* expected request magic, set at init */
static unsigned ping_bulk_magic;        /* reply magic, set at init */
static int nal = 0;                     /* Your NAL (module parameter) */
static unsigned long packets_valid = 0; /* Valid packets */
static int running = 1;                 /* 0 asks the thread to stop; the
                                         * thread sets it back to 1 on exit */
atomic_t pkt;                           /* pings received but not yet answered */

static struct pingsrv_data *server=NULL; /* Our ping server */

/*
 * Release the ping server's resources.
 *
 * 'err' selects how far setup got; each case deliberately falls through
 * into the next:
 *   1 - incoming MD attached
 *   2 - ME / EQ stage
 *   3 - network interface acquired
 *   4, 5 - only memory allocated
 * 'server' is reset to NULL once freed.  Always returns NULL so that
 * pingsrv_setup() can "return pingsrv_shutdown(n)".
 */
static void *pingsrv_shutdown(int err)
{
        int rc;

        switch (err) {
                case 1:
                        /* Unlink any memory descriptors we may have used */
                        if ((rc = PtlMDUnlink (server->mdin_h)))
                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
                        /* fallthrough */
                case 2:
                        /* Free the event queue */
                        if ((rc = PtlEQFree (server->eq)))
                                PDEBUG ("PtlEQFree", rc);

                        /* Unlink the client portal from the ME list */
                        if ((rc = PtlMEUnlink (server->me)))
                                PDEBUG ("PtlMEUnlink", rc);
                        /* fallthrough */
                case 3:
                        kportal_put_ni (nal);
                        /* fallthrough */
                case 4:
                case 5:
                        /* check 'server' before looking inside it */
                        if (server == NULL)
                                break;

                        if (server->in_buf != NULL) {
                                PORTAL_FREE (server->in_buf, MAXSIZE);
                        }

                        PORTAL_FREE (server, sizeof (struct pingsrv_data));
                        server = NULL;
                        break;
                default:
                        break;
        }

        CDEBUG (D_OTHER, "ping sever resources released\n");
        return NULL;
} /* pingsrv_shutdown() */


/*
 * Server thread: sleeps until pingsrv_callback() reports a packet, then
 * answers it by putting the (magic-rewritten) receive buffer back to the
 * initiator and re-attaching the receive MD for the next ping.
 *
 * Exits when 'running' is cleared; on the way out it releases all server
 * resources and sets 'running' back to 1 as the "thread has gone" signal
 * that pingsrv_cleanup() waits for.
 */
int pingsrv_thread(void *arg)
{
        int      rc;
        unsigned magic;

        kportal_daemonize ("pingsrv");
        server->tsk = current;

        while (running) {
                set_current_state (TASK_INTERRUPTIBLE);
                if (atomic_read (&pkt) == 0) {
                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
                        continue;
                }

                /* read the magic as a 32-bit unsigned value so the
                 * comparison is correct on 64-bit machines as well */
                magic = *((unsigned *)(server->evnt.mem_desc.start +
                                       server->evnt.offset));

                if (magic != ping_head_magic) {
                        printk("Unexpected Packet to the server\n");
                }

                /* overwrite only the leading magic word; the sequence
                 * number and timestamp behind it go back untouched */
                memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));

                server->mdout.length    = server->evnt.rlength;
                server->mdout.start     = server->in_buf;
                server->mdout.threshold = 1;
                server->mdout.options   = PTL_MD_OP_PUT;
                server->mdout.user_ptr  = NULL;
                server->mdout.eventq    = PTL_EQ_NONE;

                /* Bind the outgoing buffer */
                if ((rc = PtlMDBind (server->ni, server->mdout,
                                     &server->mdout_h))) {
                        PDEBUG ("PtlMDBind", rc);
                        /* frees 'server'; pingsrv_cleanup() copes with that */
                        pingsrv_shutdown (1);
                        return 1;
                }

                /* Re-arm the receive buffer (its threshold of 1 was used up) */
                server->mdin.start     = server->in_buf;
                server->mdin.length    = MAXSIZE;
                server->mdin.threshold = 1;
                server->mdin.options   = PTL_MD_OP_PUT;
                server->mdin.user_ptr  = NULL;
                server->mdin.eventq    = server->eq;

                if ((rc = PtlMDAttach (server->me, server->mdin,
                                       PTL_UNLINK, &server->mdin_h))) {
                        PDEBUG ("PtlMDAttach (bulk)", rc);
                }

                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
                                  server->evnt.initiator, PTL_PING_CLIENT,
                                  0, 0, 0, 0)))
                        PDEBUG ("PtlPut", rc);

                atomic_dec (&pkt);
        }

        pingsrv_shutdown (1);
        running = 1;
        return 0;
}

/* Count a received ping and wake the server thread.  Always returns 1. */
static int pingsrv_packet(ptl_event_t *ev)
{
        atomic_inc (&pkt);
        /* tsk is only set once the thread has started; until then the
         * packet stays counted in 'pkt' and is handled when it runs */
        if (server->tsk != NULL)
                wake_up_process (server->tsk);
        return 1;
} /* pingsrv_packet() */

/*
 * Event queue callback: records the event for the server thread, logs
 * the ping's header fields and hands over to pingsrv_packet().
 * Returns 0 for a NULL event, otherwise pingsrv_packet()'s result.
 */
static int pingsrv_callback(ptl_event_t *ev)
{
        if (ev == NULL) {
                CERROR ("null in callback, ev=%p\n", ev);
                return 0;
        }
        server->evnt = *ev;

        printk ("received ping from nid "LPX64" "
                "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
                ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
                *((int *)(ev->mem_desc.start + ev->offset)),
                *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))),
                *((int *)(ev->mem_desc.start + ev->offset + 2 *
                          sizeof(unsigned))));

        packets_valid++;

        return pingsrv_packet(ev);
} /* pingsrv_callback() */


/*
 * Acquire the NAL, attach the match entry, allocate the event queue and
 * receive buffer, and attach the receive MD.
 *
 * Expects 'server' to be allocated and zeroed.  Returns 'server' on
 * success; on any failure everything (including 'server') is released
 * and NULL is returned.
 */
static struct pingsrv_data *pingsrv_setup(void)
{
        ptl_handle_ni_t *nip;
        int              rc;

        /* Acquire and initialize the proper nal for portals. */
        if ((nip = kportal_get_ni (nal)) == NULL) {
                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
                return pingsrv_shutdown (4);
        }

        server->ni = *nip;

        /* Based on the initialization acquire our unique portal ID. */
        if ((rc = PtlGetId (server->ni, &server->my_id))) {
                PDEBUG ("PtlGetId", rc);
                return pingsrv_shutdown (3);    /* only the NI is held */
        }

        server->id_local.nid = PTL_NID_ANY;
        server->id_local.pid = PTL_PID_ANY;

        /* Attach a match entries for header packets */
        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
                               server->id_local, 0, ~0,
                               PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
                PDEBUG ("PtlMEAttach", rc);
                return pingsrv_shutdown (3);    /* no ME/EQ exists yet */
        }

        if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback,
                              &server->eq))) {
                PDEBUG ("PtlEQAlloc (callback)", rc);
                return pingsrv_shutdown (2);
        }

        PORTAL_ALLOC (server->in_buf, MAXSIZE);
        if (!server->in_buf) {
                CDEBUG (D_OTHER,"Allocation error\n");
                return pingsrv_shutdown(2);
        }

        /* Setup the incoming buffer */
        server->mdin.start     = server->in_buf;
        server->mdin.length    = MAXSIZE;
        server->mdin.threshold = 1;
        server->mdin.options   = PTL_MD_OP_PUT;
        server->mdin.user_ptr  = NULL;
        server->mdin.eventq    = server->eq;
        memset (server->in_buf, 0, STDSIZE);

        if ((rc = PtlMDAttach (server->me, server->mdin,
                               PTL_UNLINK, &server->mdin_h))) {
                PDEBUG ("PtlMDAttach (bulk)", rc);
                /* without a receive MD the server can never see a ping */
                return pingsrv_shutdown (2);
        }

        CDEBUG (D_OTHER, "ping server resources allocated\n");

        /* Success! */
        return server;
} /* pingsrv_setup() */

/* Set the server up and start its thread.  Returns 0 or -ENOMEM. */
static int pingsrv_start(void)
{
        /* Setup our server */
        if (!pingsrv_setup()) {
                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
                return -ENOMEM;
        }
        kernel_thread (pingsrv_thread,NULL,0);
        return 0;
} /* pingsrv_start() */



/* Module init: allocate the server state and start the server. */
static int __init pingsrv_init(void)
{
        ping_head_magic = PING_HEADER_MAGIC;
        ping_bulk_magic = PING_BULK_MAGIC;

        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
        if (server == NULL) {
                CERROR ("Unable to allocate server structure\n");
                return -ENOMEM;
        }
        /* the teardown code tests in_buf / tsk against NULL, so start
         * from a zeroed structure (as kping_client() does for the client) */
        memset (server, 0, sizeof(struct pingsrv_data));

        return pingsrv_start ();
} /* pingsrv_init() */


/* Module exit: ask the server thread to stop and wait until it has. */
static void __exit pingsrv_cleanup(void)
{
        remove_proc_entry ("net/pingsrv", NULL);

        /* the thread has already torn everything down after an error */
        if (server == NULL)
                return;

        running = 0;
        wake_up_process (server->tsk);
        while (running != 1) {
                set_current_state (TASK_UNINTERRUPTIBLE);
                schedule_timeout (HZ);
        }

} /* pingsrv_cleanup() */


MODULE_PARM(nal, "i");
MODULE_PARM_DESC(nal, "Use the specified NAL "
                 "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");

MODULE_AUTHOR("Brian Behlendorf (LLNL)");
MODULE_DESCRIPTION("A kernel space ping server for portals testing");
MODULE_LICENSE("GPL");

module_init(pingsrv_init);
module_exit(pingsrv_cleanup);
diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c new file mode 100644 index 0000000..4cef08b --- /dev/null +++ b/lustre/portals/tests/sping_cli.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation.
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes + assumed */ + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, STDSIZE); + + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + wake_up_process (client->tsk); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + const ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + int rc; + + client->tsk = current; + client->args = args; + + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, STDSIZE); + + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded.\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = STDSIZE; + client->md_in_head.threshold = 1; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, STDSIZE); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE; + client->md_out_head.threshold = 1; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Put the ping packet */ + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 
0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + + count = 0; + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" Time out on the server\n"); + pingcli_shutdown (2); + return NULL; + } else + printk("Received respose from the server \n"); + + + pingcli_shutdown (2); + + /* Success! */ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + memset (client, 0, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c new file mode 100644 index 0000000..a18ea35 --- /dev/null +++ b/lustre/portals/tests/sping_srv.c @@ -0,0 +1,295 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Amey Inamdar + * Kedar Sovani + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General 
Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) + +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#endif + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, STDSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + server->mdout.start = server->in_buf; + server->mdout.length = STDSIZE; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static 
int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, STDSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, 
server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! */ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lustre/portals/tests/startclient.sh b/lustre/portals/tests/startclient.sh new file mode 100755 index 0000000..c9b7c16 --- /dev/null +++ b/lustre/portals/tests/startclient.sh @@ -0,0 +1,37 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-0} + +if [ $SIMPLE -eq 0 ]; then + PING=pingcli.o +else + PING=spingcli.o +fi + +case "$1" in + toe) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../toenal/ktoenal.o + /sbin/insmod ./$PING + echo ktoenal > /tmp/nal + ;; + + tcp) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../socknal/ksocknal.o + /sbin/insmod ./$PING + echo ksocknal > /tmp/nal + ;; + + elan) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../qswnal/kqswnal.o + /sbin/insmod ./$PING + echo kqswnal > /tmp/nal + ;; + + *) + echo "Usage : ${0} < tcp | toe | elan >" + exit 1; 
+esac +exit 0; diff --git a/lustre/portals/tests/startserver.sh b/lustre/portals/tests/startserver.sh new file mode 100755 index 0000000..942300e --- /dev/null +++ b/lustre/portals/tests/startserver.sh @@ -0,0 +1,38 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-0} + +if [ $SIMPLE -eq 0 ]; then + PING=pingsrv.o +else + PING=spingsrv.o +fi + +case "$1" in + toe) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../toenal/ktoenal.o + /sbin/insmod ./$PING nal=4 + echo ktoenal > /tmp/nal + ;; + + tcp) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../socknal/ksocknal.o + /sbin/insmod ./$PING nal=2 + echo ksocknal > /tmp/nal + ;; + + elan) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../qswnal/kqswnal.o + /sbin/insmod ./$PING nal=4 + echo kqswnal > /tmp/nal + ;; + + *) + echo "Usage : ${0} < tcp | toe | elan >" + exit 1; +esac +../utils/acceptor 9999& +exit 0; diff --git a/lustre/portals/tests/stopclient.sh b/lustre/portals/tests/stopclient.sh new file mode 100755 index 0000000..f7e3aa1 --- /dev/null +++ b/lustre/portals/tests/stopclient.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-1} + +if [ $SIMPLE -eq 0 ]; then + PING=spingcli +else + PING=pingcli +fi + +rmmod $PING +NAL=`cat /tmp/nal`; +rmmod $NAL +rmmod portals diff --git a/lustre/portals/tests/stopserver.sh b/lustre/portals/tests/stopserver.sh new file mode 100644 index 0000000..3e81831 --- /dev/null +++ b/lustre/portals/tests/stopserver.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-1} + +if [ $SIMPLE -eq 0 ]; then + PING=spingsrv +else + PING=pingsrv +fi + +rmmod $PING +NAL=`cat /tmp/nal`; +rmmod $NAL +killall -9 acceptor +rm -f /var/run/acceptor-9999.pid +rmmod portals diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am new file mode 100644 index 0000000..b62b401 --- /dev/null +++ b/lustre/portals/unals/Makefile.am @@ -0,0 +1,5 @@ +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir) +lib_LIBRARIES = libtcpnal.a +pkginclude_HEADERS = pqtimer.h 
dispatch.h table.h timer.h connection.h +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h diff --git a/lustre/portals/unals/README b/lustre/portals/unals/README new file mode 100644 index 0000000..6cb93d9 --- /dev/null +++ b/lustre/portals/unals/README @@ -0,0 +1,53 @@ +This library implements two NAL interfaces, both running over IP. +The first, tcpnal, creates TCP connections between participating +processes in order to transport the portals requests. The second, +ernal, provides a simple transport protocol which runs over +UDP datagrams. + +The interface functions return both of these values in host order for +convenience and readability. However this means that addresses +exchanged in messages between hosts of different orderings will not +function properly. + +Both NALs use the same support functions in order to schedule events +and communicate with the generic portals implementation. + + ------------------------- + | api | + |_______________________| + | lib | + |_______________________| + | ernal | |tcpnal | + |--------| |----------| + | udpsock| |connection| + |-----------------------| + | timer/select | + ------------------------- + + + These NALs use the framework from fdnal of a pipe between the api +and library sides. This is wrapped up in the select on the library +side, and blocks on the api side. Performance could be severely +enhanced by collapsing this artificial barrier, by using shared +memory queues, or by wiring the api layer directly to the library. + + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE. pid is now a completely arbitrary number in the +range of 0 to 255.
The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". +TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lustre/portals/unals/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
/* address.c:
 * this file provides functions to acquire the IP address of the node
 * and translate them into a NID/PID pair which supports a static
 * mapping of virtual nodes into the port range of an IP socket.
 */

/* NOTE(review): the original has seven #include directives here whose
 * header names were lost when this text was extracted. The standard headers
 * below are the ones get_node_id() needs; the project headers that declare
 * `bridge`, ptl_pid_t and the PNAL_* macros used by set_address() must be
 * restored from the upstream file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <arpa/inet.h>


/* Function:  get_node_id
 * Returns: a 32 bit id for this node: its IPv4 address in host byte
 *          order, or 0 when no address can be determined
 *
 * When the PTL_HOSTID environment variable is set it overrides the
 * lookup; it is either hex starting with "0x" (or "0X") or dotted
 * decimal (n.n.n.n). A malformed value is reported on stderr and yields 0.
 *
 * Otherwise get_node_id() determines the host name and uses the resolver
 * to find out its ip address.  This is fairly fragile and inflexible, but
 * explicitly asking about interfaces and their addresses is very
 * complicated and nonportable.
 */
static unsigned int get_node_id(void)
{
        const char *host_envp = getenv("PTL_HOSTID");
        unsigned int a, b, c, d;

        if (host_envp == NULL) {
                char buffer[255];
                struct hostent *he;
                unsigned int x = 0;

                if (gethostname(buffer, sizeof(buffer)) != 0)
                        return 0;
                /* gethostname() may leave a truncated name unterminated */
                buffer[sizeof(buffer) - 1] = '\0';

                he = gethostbyname(buffer);
                /* Copy the bytes out instead of casting the char pointer:
                 * the address is not guaranteed to be aligned for an
                 * unsigned int. */
                if (he != NULL && he->h_addr_list[0] != NULL &&
                    he->h_length >= (int)sizeof(x))
                        memcpy(&x, he->h_addr_list[0], sizeof(x));
                return ntohl(x);
        }

        /* Testing [0] first keeps the [1] access inside an empty string. */
        if (host_envp[0] == '0' &&
            (host_envp[1] == 'x' || host_envp[1] == 'X'))
                return (unsigned int)strtoull(host_envp, NULL, 16);

        /* Unsigned octets keep the shift by 24 well defined for values
         * of 128 and above. */
        if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4 ||
            a > 255 || b > 255 || c > 255 || d > 255) {
                fprintf(stderr, "PTL_HOSTID \"%s\" is neither n.n.n.n nor 0x hex\n",
                        host_envp);
                return 0;
        }
        return (a << 24) | (b << 16) | (c << 8) | d;
}
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lustre/portals/unals/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c new file mode 100644 index 0000000..89c9f78 --- /dev/null +++ b/lustre/portals/unals/connection.c @@ -0,0 +1,293 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */ +int read_connection(connection c, + unsigned char *dest, + int len) +{ + int offset=0,rc; + + if (len){ + do { + if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){ + if (errno==EINTR) { + rc=0; + } else { + remove_connection(c); + return(0); + } + } + offset+=rc; + } while (offsetm->handler)(c->m->handler_arg,c)); +} + + +/* Function: allocate_connection + * Arguments: t: tcpnal the allocation is occuring in the context of + * dest: portal endpoint address for this connection + * fd: open file descriptor for the socket + * Returns: an allocated connection structure + * + * just encompasses the action common to active and passive + * connections of allocation and placement in the global table + */ +static connection allocate_connection(manager m, + unsigned int ip, + unsigned short port, + int fd) +{ + connection c=malloc(sizeof(struct connection)); + unsigned int id[2]; + c->m=m; + c->fd=fd; + c->ip=ip; + c->port=port; + id[0]=ip; + id[1]=port; + register_io_handler(fd,READ_HANDLER,connection_input,c); + hash_table_insert(m->connections,c,id); + return(c); +} + + +/* Function: new_connection + * Arguments: t: opaque argument holding the tcpname + * Returns: 1 in order to reregister for new connection requests + * + * called when the bound service socket recieves + * a new connection request, it always accepts and + * installs a new connection + */ +static int new_connection(void *z) +{ + manager m=z; + struct sockaddr_in s; + int len=sizeof(struct sockaddr_in); + int fd=accept(m->bound,(struct sockaddr *)&s,&len); + unsigned int nid=*((unsigned int *)&s.sin_addr); + /* cfs specific hack */ + //unsigned short pid=s.sin_port; + allocate_connection(m,htonl(nid),0/*pid*/,fd); + return(1); +} + + +/* Function: force_tcp_connection + * Arguments: t: tcpnal + * dest: portals endpoint for the connection + * Returns: an allocated connection structure, either + * a pre-existing one, or a new connection + */ +connection force_tcp_connection(manager m, + 
unsigned int ip, + unsigned short port) +{ + connection c; + struct sockaddr_in addr; + unsigned int id[2]; + + port = tcpnal_acceptor_port; + + id[0]=ip; + id[1]=port; + + if (!(c=hash_table_find(m->connections,id))){ + int fd; + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("tcpnal socket failed"); + exit(-1); + } + if (connect(fd, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in))) + { + perror("tcpnal connect"); + return(0); + } + return(allocate_connection(m,ip,port,fd)); + } + return(c); +} + + +/* Function: bind_socket + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: 1 on success, or 0 on error + * + * bind_socket() attempts to allocate and bind a socket to the requested + * port, or dynamically assign one from the kernel should the port be + * zero. Sets the bound and bound_handler elements of m. + * + * TODO: The port should be an explicitly sized type. 
+ */ +static int bind_socket(manager m,unsigned short port) +{ + struct sockaddr_in addr; + int alen=sizeof(struct sockaddr_in); + + if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) + return(0); + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = 0; + addr.sin_port = port; + + if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ + perror ("tcpnal bind"); + return(0); + } + + getsockname(m->bound,(struct sockaddr *)&addr, &alen); + + m->bound_handler=register_io_handler(m->bound,READ_HANDLER, + new_connection,m); + listen(m->bound,5); + m->port=addr.sin_port; + return(1); +} + + +/* Function: shutdown_connections + * Arguments: m: the manager structure + * + * close all connections and reclaim resources + */ +void shutdown_connections(manager m) +{ + close(m->bound); + remove_io_handler(m->bound_handler); + hash_destroy_table(m->connections,remove_connection); + free(m); +} + + +/* Function: init_connections + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: a newly allocated manager structure, or + * zero if the fixed port could not be bound + */ +manager init_connections(unsigned short pid, + int (*input)(), + void *a) +{ + manager m=(manager)malloc(sizeof(struct manager)); + m->connections=hash_create_table(compare_connection,connection_key); + m->handler=input; + m->handler_arg=a; + if (bind_socket(m,pid)) return(m); + free(m); + return(0); +} diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h new file mode 100644 index 0000000..f6b2994 --- /dev/null +++ b/lustre/portals/unals/connection.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, + unsigned int ip, + unsigned int short); +manager init_connections(unsigned short, + int (*f)(void *,connection), + void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, + unsigned char *dest, + int len); diff --git a/lustre/portals/unals/debug.c b/lustre/portals/unals/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lustre/portals/unals/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +int smp_processor_id = 1; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; +} + + +void portals_debug_print(void) +{ + return; +} + + +void portals_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + + +int portals_debug_init(unsigned long bufsize) +{ + debug_file_fd = stdout; + return 0; +} + +int portals_debug_cleanup(void) +{ + return 0; //close(portals_debug_fd); +} + +int portals_debug_clear_buffer(void) +{ + return 0; +} + +int portals_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, "*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +int portals_debug_copy_to_user(char *buf, unsigned long len) +{ + return 0; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + const char *format, ...) +{ + va_list ap; + unsigned long flags; + struct timeval tv; + int nob; + + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+ */ + + gettimeofday(&tv, NULL); + + nob += fprintf(debug_file_fd, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id, + tv.tv_sec, tv.tv_usec); + + nob += fprintf(debug_file_fd, + "(%s:%d:%s() %d+%ld): ", + file, line, fn, 0, + 8192 - ((unsigned long)&flags & 8191UL)); + + va_start (ap, format); + nob += fprintf(debug_file_fd, format, ap); + va_end (ap); + + +} + diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h new file mode 100644 index 0000000..34dd070 --- /dev/null +++ b/lustre/portals/unals/dispatch.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* this file is only called dispatch.h to prevent it + from colliding with /usr/include/sys/select.h */ + +typedef struct io_handler *io_handler; + +struct io_handler{ + io_handler *last; + io_handler next; + int fd; + int type; + int (*function)(void *); + void *argument; + int disabled; +}; + + +#define READ_HANDLER 1 +#define WRITE_HANDLER 2 +#define EXCEPTION_HANDLER 4 +#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) + +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg); + +void remove_io_handler (io_handler i); +void init_unix_timer(void); +void select_timer_block(when until); +when now(void); diff --git a/lustre/portals/unals/ipmap.h b/lustre/portals/unals/ipmap.h new file mode 100644 index 0000000..85b1e18 --- /dev/null +++ b/lustre/portals/unals/ipmap.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#define DIRECT_IP_MODE +#ifdef DIRECT_IP_MODE +#define PNAL_NID(in_addr, port) (in_addr) +#define PNAL_PID(pid) (pid) +#define PNAL_IP(in_addr, port) (in_addr) +#define PNAL_PORT(nid, pid) (pid) +#else + +#define PNAL_BASE_PORT 4096 +#define PNAL_HOSTID_SHIFT 24 +#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) +#define PNAL_VNODE_SHIFT 8 +#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) +#define PNAL_PID_SHIFT 8 +#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) + +#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ + << PNAL_VNODE_SHIFT) \ + | (((ntohs(port)-PNAL_BASE_PORT) >>\ + PNAL_PID_SHIFT))) +#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) + +#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ + >> PNAL_VNODE_SHIFT)\ + | (t->iptop8 << PNAL_HOSTID_SHIFT))) +#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ + << PNAL_VNODE_SHIFT) \ + | ((pid) & PNAL_PID_MASK)) \ + + PNAL_BASE_PORT)) +#endif diff --git a/lustre/portals/unals/pqtimer.c b/lustre/portals/unals/pqtimer.c new file mode 100644 index 0000000..fa2fb4f --- /dev/null +++ b/lustre/portals/unals/pqtimer.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* timer.c: + * this file implements a simple priority-queue based timer system. when + * combined with a file which implements now() and block(), it can + * be used to provide course-grained time-based callbacks. + */ + +#include +#include +#include + +struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +}; + +typedef struct thunk *thunk; +struct thunk { + void (*f)(void *); + void *a; + thunk next; +}; + +extern when now(void); + +static thunk thunks; +static int internal; +static void (*block_function)(when); +static int number_of_timers; +static int size_of_pqueue; +static timer *timers; + + +static void heal(int where) +{ + int left=(where<<1); + int right=(where<<1)+1; + int min=where; + timer temp; + + if (left <= number_of_timers) + if (timers[left]->w < timers[min]->w) min=left; + if (right <= number_of_timers) + if (timers[right]->w < timers[min]->w) min=right; + if (min != where){ + temp=timers[where]; + timers[where]=timers[min]; + timers[min]=temp; + heal(min); + } +} + +static void add_pqueue(int i) +{ + timer temp; + int parent=(i>>1); + if ((i>1) && (timers[i]->w< timers[parent]->w)){ + temp=timers[i]; + timers[i]=timers[parent]; + timers[parent]=temp; + add_pqueue(parent); + } +} + +static void add_timer(timer t) +{ + if (size_of_pqueue<(number_of_timers+2)){ + int oldsize=size_of_pqueue; + timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10)); + memcpy(new,timers,sizeof(timer)*oldsize); + timers=new; + } + timers[++number_of_timers]=t; + add_pqueue(number_of_timers); +} + +/* Function: register_timer + * Arguments: interval: the time interval from the current time when + * the timer function should be called + * function: the function to call when the time has expired + * argument: 
the argument to call it with. + * Returns: a pointer to a timer structure + */ +timer register_timer(when interval, + void (*function)(void *), + void *argument) +{ + timer t=(timer)malloc(sizeof(struct timer)); + + t->arg=argument; + t->function=function; + t->interval=interval; + t->disable=0; + t->w=now()+interval; + add_timer(t); + if (!internal && (number_of_timers==1)) + block_function(t->w); + return(t); +} + +/* Function: remove_timer + * Arguments: t: + * Returns: nothing + * + * remove_timer removes a timer from the system, insuring + * that it will never be called. It does not actually + * free the timer due to reentrancy issues. + */ + +void remove_timer(timer t) +{ + t->disable=1; +} + + + +void timer_fire() +{ + timer current; + + current=timers[1]; + timers[1]=timers[number_of_timers--]; + heal(1); + if (!current->disable) { + (*current->function)(current->arg); + } + free(current); +} + +when next_timer(void) +{ + when here=now(); + + while (number_of_timers && (timers[1]->w <= here)) timer_fire(); + if (number_of_timers) return(timers[1]->w); + return(0); +} + +/* Function: timer_loop + * Arguments: none + * Returns: never + * + * timer_loop() is the blocking dispatch function for the timer. + * Is calls the block() function registered with init_timer, + * and handles associated with timers that have been registered. + */ +void timer_loop() +{ + when here; + + while (1){ + thunk z; + here=now(); + + for (z=thunks;z;z=z->next) (*z->f)(z->a); + + if (number_of_timers){ + if (timers[1]->w > here){ + (*block_function)(timers[1]->w); + } else { + timer_fire(); + } + } else { + thunk z; + for (z=thunks;z;z=z->next) (*z->f)(z->a); + (*block_function)(0); + } + } +} + + +/* Function: register_thunk + * Arguments: f: the function to call + * a: the single argument to call it with + * + * Thunk functions get called at irregular intervals, they + * should not assume when, or take a particularily long + * amount of time. 
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lustre/portals/unals/pqtimer.h b/lustre/portals/unals/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lustre/portals/unals/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lustre/portals/unals/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Function: forward + * Arguments: nal_t *nal: pointer to my top-side nal structure + * id: the command to pass to the lower layer + * args, args_len:pointer to and length of the request + * ret, ret_len: pointer to and size of the result + * Returns: a portals status code + * + * forwards a packaged api call from the 'api' side to the 'library' + * side, and collects the result + */ +#define forward_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(PTL_SEGV);\ + } +static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len, + void *ret, ptl_size_t ret_len) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int lib=p->to_lib[1]; + int k; + + forward_failure(write,lib, &id, sizeof(id)); + forward_failure(write,lib,&args_len, sizeof(args_len)); + forward_failure(write,lib,&ret_len, sizeof(ret_len)); + forward_failure(write,lib,args, args_len); + + do { + k=syscall(SYS_read, p->from_lib[0], ret, ret_len); + } while ((k!=ret_len) && (errno += EINTR)); + + if(k!=ret_len){ + perror("nal: read return block"); + return PTL_SEGV; + } + return (PTL_OK); +} +#undef forward_failure + + +/* Function: shutdown + * Arguments: nal: a pointer to my top side nal structure + * ni: my network interface index + * + * cleanup nal state, reclaim the lower side thread and + * its state using PTL_FINI codepoint + */ +static int procbridge_shutdown(nal_t *n, int ni) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int code=PTL_FINI; + + syscall(SYS_write, p->to_lib[1],&code,sizeof(code)); + syscall(SYS_read, p->from_lib[0],&code,sizeof(code)); + + syscall(SYS_close, p->to_lib[0]); + syscall(SYS_close, p->to_lib[1]); + syscall(SYS_close, p->from_lib[0]); + syscall(SYS_close, p->from_lib[1]); + + free(p); + return(0); +} + + +/* Function: validate + * 
useless stub + */ +static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent) +{ + return(0); +} + + +/* Function: yield + * Arguments: pid: + * + * this function was originally intended to allow the + * lower half thread to be scheduled to allow progress. we + * overload it to explicitly block until signalled by the + * lower half. + */ +static void procbridge_yield(nal_t *n) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_wait(&p->cond,&p->mutex); + pthread_mutex_unlock(&p->mutex); +} + + +static void procbridge_lock(nal_t * nal, unsigned long *flags){} +static void procbridge_unlock(nal_t * nal, unsigned long *flags){} +/* api_nal + * the interface vector to allow the generic code to access + * this nal. this is seperate from the library side nal_cb. + * TODO: should be dyanmically allocated + */ +static nal_t api_nal = { + ni: {0}, + nal_data: NULL, + forward: procbridge_forward, + shutdown: procbridge_shutdown, + validate: procbridge_validate, + yield: procbridge_yield, + lock: procbridge_lock, + unlock: procbridge_unlock +}; + +/* Function: bridge_init + * + * Arguments: pid: requested process id (port offset) + * PTL_ID_ANY not supported. + * desired: limits passed from the application + * and effectively ignored + * actual: limits actually allocated and returned + * + * Returns: a pointer to my statically allocated top side NAL + * structure + * + * initializes the tcp nal. we define unix_failure as an + * error wrapper to cut down clutter. 
+ */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + 
perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lustre/portals/unals/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include +#include +#include + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lustre/portals/unals/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. + * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 
'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */ + +#define data_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(0);\ + } +static int data_from_api(void *arg) +{ + bridge b = arg; + procbridge p=(procbridge)b->local; + /* where are these two sizes derived from ??*/ + char arg_block[ 256 ]; + char ret_block[ 128 ]; + ptl_size_t arg_len,ret_len; + int fd=p->to_lib[0]; + int index; + + data_failure(read,fd, &index, sizeof(index)); + + if (index==PTL_FINI) { + lib_fini(b->nal_cb); + if (b->shutdown) (*b->shutdown)(b); + syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive)); + + /* a heavy-handed but convenient way of shutting down + the lower side thread */ + pthread_exit(0); + } + + data_failure(read,fd, &arg_len, sizeof(arg_len)); + data_failure(read,fd, &ret_len, sizeof(ret_len)); + data_failure(read,fd, arg_block, arg_len); + + lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block); + + data_failure(write,p->from_lib[1],ret_block, ret_len); + return(1); +} +#undef data_failure + + + +static void wakeup_topside(void *z) +{ + bridge b=z; + procbridge p=b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); +} + + +/* Function: nal_thread + * Arguments: z: an opaque reference to a nal control structure + * allocated and partially populated by the api level code + * Returns: nothing, and only on error or explicit shutdown + * + * This function is the entry point of the pthread initiated on + * the api side of the interface. This thread is used to handle + * asynchronous delivery to the application. 
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type)))) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); + /* initialize the generic 'library' level code */ + + rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lustre/portals/unals/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */ +void remove_io_handler (io_handler i) +{ + i->disabled=1; +} + +static void set_flag(io_handler n,fd_set *fds) +{ + if (n->type & READ_HANDLER) FD_SET(n->fd,fds); + if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2); +} + + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int result; + io_handler j; + io_handler *k; + + /* TODO: loop until the entire interval is expired*/ + if (until){ + when interval=until-now(); + timeout.tv_sec=(interval>>32); + timeout.tv_usec=((interval<<32)/1000000)>>32; + timeout_pointer=&timeout; + } else timeout_pointer=0; + + FD_ZERO(fds); + FD_ZERO(fds+1); + FD_ZERO(fds+2); + for (k=&io_handlers;*k;){ + if ((*k)->disabled){ + j=*k; + *k=(*k)->next; + free(j); + } + if (*k) { + set_flag(*k,fds); + k=&(*k)->next; + } + } + result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer); + + if (result > 0) + for (j=io_handlers;j;j=j->next){ + if (!(j->disabled) && + ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) || + (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) || + (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){ + if (!(*j->function)(j->argument)) + j->disabled=1; + } + } +} + +/* Function: init_unix_timer() + * is called to initialize the library + */ +void init_unix_timer() +{ + io_handlers=0; + gettimeofday(&beginning_of_epoch, 0); + initialize_timer(select_timer_block); +} diff --git a/lustre/portals/unals/table.c b/lustre/portals/unals/table.c new file mode 100644 index 0000000..bef13c5 --- /dev/null +++ b/lustre/portals/unals/table.c @@ -0,0 +1,264 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * 
vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + + +/* table.c: + * a very simple hash table implementation with paramerterizable + * comparison and key generation functions. 
it does resize + * in order to accomidate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;inext; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + * hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + new->compare_function=compare_function; + new->key_function=key_function; + 
new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? 
*/ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;isize;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lustre/portals/unals/table.h b/lustre/portals/unals/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lustre/portals/unals/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c new file mode 100644 index 0000000..8bf55c4 --- /dev/null +++ b/lustre/portals/unals/tcpnal.c @@ -0,0 +1,196 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signally behaviour, but its + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *nal: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * user_ptr data: pointer to the destination buffer + * size_t mlen: length of the body + * size_t rlen: length of data in the network + * Returns: zero on success + * + * blocking read of the requested data. must drain out the + * difference of mainpulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a,connection c) +{ + bridge b=a; + ptl_hdr_t hdr; + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: PTL_IFACE_TCP + * Arguments: pid_request: desired port number to bind to + * desired: passed NAL limits structure + * actual: returned NAL limits structure + * Returns: a nal structure on success, or null on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lustre/portals/unals/timer.h b/lustre/portals/unals/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lustre/portals/unals/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lustre/portals/unals/utypes.h b/lustre/portals/unals/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lustre/portals/unals/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lustre/portals/utils/.cvsignore b/lustre/portals/utils/.cvsignore new file mode 100644 index 0000000..041cd6b --- /dev/null +++ b/lustre/portals/utils/.cvsignore @@ -0,0 +1,7 @@ +Makefile +Makefile.in +acceptor +debugctl +ptlctl +.deps +routerstat diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am new file mode 100644 index 0000000..065fcf9 --- /dev/null +++ b/lustre/portals/utils/Makefile.am @@ -0,0 +1,25 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +COMPILE = gcc -Wall -g -I$(srcdir)/../include +LINK = gcc -o $@ + +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat +lib_LIBRARIES = libptlctl.a + +acceptor_SOURCES = acceptor.c # -lefence + +libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h + +ptlctl_SOURCES = ptlctl.c +ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence +ptlctl_DEPENDENCIES = libptlctl.a + +debugctl_SOURCES = debugctl.c +debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_DEPENDENCIES = libptlctl.a + +routerstat_SOURCES = routerstat.c diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c new file mode 100644 index 0000000..c6590db --- /dev/null +++ b/lustre/portals/utils/acceptor.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +/* should get this from autoconf somehow */ +#ifndef PIDFILE_DIR +#define PIDFILE_DIR "/var/run" +#endif + +#define PROGNAME "acceptor" + +void create_pidfile(char *name, int port) +{ + char pidfile[1024]; + FILE *fp; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if ((fp = fopen(pidfile, "w"))) { + fprintf(fp, "%d\n", getpid()); + fclose(fp); + } else { + syslog(LOG_ERR, "%s: %s\n", pidfile, + strerror(errno)); + } +} + +int pidfile_exists(char *name, int port) +{ + char pidfile[1024]; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if (!access(pidfile, F_OK)) { + fprintf(stderr, "%s: exists, acceptor already running.\n", + pidfile); + return (1); + } + return (0); +} + +int +parse_size (int *sizep, char *str) +{ + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) + { + default: + return (-1); + + case 1: + *sizep = size; 
+ return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +void +show_connection (int fd, __u32 net_ip, ptl_nid_t nid) +{ + struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); + __u32 host_ip = ntohl (net_ip); + int rxmem = 0; + int txmem = 0; + int nonagle = 0; + int len; + char host[1024]; + + len = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0) + perror ("Cannot get write buffer size"); + + len = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0) + perror ("Cannot get read buffer size"); + + len = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0) + perror ("Cannot get nagle"); + + if (h == NULL) + snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, + (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); + else + snprintf (host, sizeof(host), "%s", h->h_name); + + syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", + host, nid, txmem, rxmem, nonagle ? 
"disabled" : "enabled"); +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. 
*/ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... */ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +void +usage (char *myname) +{ + fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname); + exit (1); +} + +int main(int argc, char **argv) +{ + int o, fd, rc, port, pfd; + struct sockaddr_in srvaddr; + int c; + int rxmem = 0; + int txmem = 0; + int noclose = 0; + int nonagle = 1; + int nal = SOCKNAL; + int xchg_nids = 0; + int bind_irq = 0; + + while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1) + switch (c) + { + case 'r': + if (parse_size (&rxmem, optarg) != 0 || rxmem < 0) + usage (argv[0]); + break; + + case 's': + if (parse_size (&txmem, optarg) != 0 || txmem < 0) + usage 
(argv[0]); + break; + + case 'n': + nonagle = 0; + break; + + case 'l': + noclose = 1; + break; + + case 'x': + xchg_nids = 1; + break; + + case 'i': + bind_irq = 1; + break; + + case 'N': + if (parse_size(&nal, optarg) != 0 || + nal < 0 || nal > NAL_MAX_NR) + usage(argv[0]); + break; + + default: + usage (argv[0]); + break; + } + + if (optind >= argc) + usage (argv[0]); + + port = atol(argv[optind++]); + + if (pidfile_exists(PROGNAME, port)) + exit(1); + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(port); + srvaddr.sin_addr.s_addr = INADDR_ANY; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("opening socket"); + exit(1); + } + + o = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { + perror("Cannot set REUSEADDR socket opt"); + exit(1); + } + + if (nonagle) + { + o = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); + if (rc != 0) + { + perror ("Cannot disable nagle"); + exit (1); + } + } + + if (txmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem)); + if (rc != 0) + { + perror ("Cannot set write buffer size"); + exit (1); + } + } + + if (rxmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem)); + if (rc != 0) + { + perror ("Cannot set read buffer size"); + exit (1); + } + } + + rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + perror("bind: "); + exit(1); + } + + if (listen(fd, 127)) { + perror("listen: "); + exit(1); + } + fprintf(stderr, "listening on port %d\n", port); + + pfd = open("/dev/portals", O_RDWR); + if ( pfd < 0 ) { + perror("opening portals device"); + exit(1); + } + + rc = daemon(1, noclose); + if (rc < 0) { + perror("daemon(): "); + exit(1); + } + + openlog(PROGNAME, LOG_PID, LOG_DAEMON); + syslog(LOG_INFO, "started, listening on port %d\n", port); + create_pidfile(PROGNAME, port); + + while (1) { + struct sockaddr_in clntaddr; + int len = 
sizeof(clntaddr); + int cfd; + struct portal_ioctl_data data; + ptl_nid_t peer_nid; + + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); + if ( cfd < 0 ) { + perror("accept"); + exit(0); + continue; + } + + if (!xchg_nids) + peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */ + else + { + PORTAL_IOC_INIT (data); + data.ioc_nal = nal; + rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + { + perror ("Can't get my NID"); + close (cfd); + continue; + } + + rc = exchange_nids (cfd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (cfd); + continue; + } + } + + show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid); + + PORTAL_IOC_INIT(data); + data.ioc_fd = cfd; + data.ioc_nal = nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { + perror("ioctl failed"); + + } else { + printf("client registered\n"); + } + rc = close(cfd); + if (rc) + perror ("close failed"); + } + + closelog(); + exit(0); + +} diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c new file mode 100644 index 0000000..13572dc --- /dev/null +++ b/lustre/portals/utils/debug.c @@ -0,0 +1,620 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define BUG() /* workaround for module.h includes */ +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +#endif + +#include +#include +#include "parser.h" + +static char rawbuf[8192]; +static char *buf = rawbuf; +static int max = 8192; +//static int g_pfd = -1; +static int subsystem_array[1 << 8]; +static int debug_mask = ~0; + +static const char *portal_debug_subsystems[] = + {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite", + "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter", + "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL}; +static const char *portal_debug_masks[] = + {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", + "blocks", "net", "warning", "buffs", "other", "dentry", "portals", + "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL}; + +struct debug_daemon_cmd { + char *cmd; + unsigned int cmdv; +}; + +static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { + {"start", DEBUG_DAEMON_START}, + {"stop", DEBUG_DAEMON_STOP}, + {"pause", DEBUG_DAEMON_PAUSE}, + {"continue", DEBUG_DAEMON_CONTINUE}, + {0, 0} +}; + +static int do_debug_mask(char *name, int enable) +{ + int found = 0, i; + + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || + strcasecmp(name, "all_subs") == 0) { + printf("%s output from subsystem \"%s\"\n", + enable ? 
"Enabling" : "Disabling", + portal_debug_subsystems[i]); + subsystem_array[i] = enable; + found = 1; + } + } + for (i = 0; portal_debug_masks[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_masks[i]) == 0 || + strcasecmp(name, "all_types") == 0) { + printf("%s output of type \"%s\"\n", + enable ? "Enabling" : "Disabling", + portal_debug_masks[i]); + if (enable) + debug_mask |= (1 << i); + else + debug_mask &= ~(1 << i); + found = 1; + } + } + + return found; +} + +int dbg_initialize(int argc, char **argv) +{ + memset(subsystem_array, 1, sizeof(subsystem_array)); + return 0; +} + +int jt_dbg_filter(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 0)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + return 0; +} + +int jt_dbg_show(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 1)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + + return 0; +} + +static int applymask(char* procpath, int value) +{ + int rc; + char buf[64]; + int len = snprintf(buf, 64, "%d", value); + + int fd = open(procpath, O_WRONLY); + if (fd == -1) { + fprintf(stderr, "Unable to open %s: %s\n", + procpath, strerror(errno)); + return fd; + } + rc = write(fd, buf, len+1); + if (rc<0) { + fprintf(stderr, "Write to %s failed: %s\n", + procpath, strerror(errno)); + return rc; + } + close(fd); + return 0; +} + +extern char *dump_filename; +extern int dump(int dev_id, int opc, void *buf); + +static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) +{ + if (!dump_filename) { + applymask("/proc/sys/portals/subsystem_debug", subs_mask); + applymask("/proc/sys/portals/debug", debug_mask); + } else { + struct portals_debug_ioctl_data data; + + data.hdr.ioc_len = sizeof(data); + 
data.hdr.ioc_version = 0; + data.subs = subs_mask; + data.debug = debug_mask; + + dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); + } + printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", + subs_mask, debug_mask); +} + +int jt_dbg_list(int argc, char **argv) +{ + int i; + + if (argc != 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return 0; + } + + if (strcasecmp(argv[1], "subs") == 0) { + printf("Subsystems: all_subs"); + for (i = 0; portal_debug_subsystems[i] != NULL; i++) + printf(", %s", portal_debug_subsystems[i]); + printf("\n"); + } else if (strcasecmp(argv[1], "types") == 0) { + printf("Types: all_types"); + for (i = 0; portal_debug_masks[i] != NULL; i++) + printf(", %s", portal_debug_masks[i]); + printf("\n"); + } + else if (strcasecmp(argv[1], "applymasks") == 0) { + unsigned int subsystem_mask = 0; + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (subsystem_array[i]) subsystem_mask |= (1 << i); + } + applymask_all(subsystem_mask, debug_mask); + } + return 0; +} + +/* if 'raw' is true, don't strip the debug information from the front of the + * lines */ +static void dump_buffer(FILE *fd, char *buf, int size, int raw) +{ + char *p, *z; + unsigned long subsystem, debug, dropped = 0, kept = 0; + int max_sub, max_type; + + for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++) + ; + for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++) + ; + + while (size) { + p = memchr(buf, '\n', size); + if (!p) + break; + subsystem = strtoul(buf, &z, 16); + debug = strtoul(z + 1, &z, 16); + + z++; + /* for some reason %*s isn't working. 
*/ + *p = '\0'; + if (subsystem < max_sub && + subsystem_array[subsystem] && + (!debug || (debug_mask & debug))) { + if (raw) + fprintf(fd, "%s\n", buf); + else + fprintf(fd, "%s\n", z); + //printf("%s\n", buf); + kept++; + } else { + //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf); + dropped++; + } + *p = '\n'; + p++; + size -= (p - buf); + buf = p; + } + + printf("Debug log: %lu lines, %lu kept, %lu dropped.\n", + dropped + kept, kept, dropped); +} + +int jt_dbg_debug_kernel(int argc, char **argv) +{ + int rc, raw = 1; + FILE *fd = stdout; + const int databuf_size = (6 << 20); + struct portal_ioctl_data data, *newdata; + char *databuf = NULL; + + if (argc > 3) { + fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); + return 0; + } + + if (argc > 1) { + fd = fopen(argv[1], "w"); + if (fd == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } + } + if (argc > 2) + raw = atoi(argv[2]); + + databuf = malloc(databuf_size); + if (!databuf) { + fprintf(stderr, "No memory for buffer.\n"); + goto out; + } + + memset(&data, 0, sizeof(data)); + data.ioc_plen1 = databuf_size; + data.ioc_pbuf1 = databuf; + + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + goto out; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n", + strerror(errno)); + goto out; + } + + newdata = (struct portal_ioctl_data *)buf; + if (newdata->ioc_size > 0) + dump_buffer(fd, databuf, newdata->ioc_size, raw); + else + fprintf(stderr, "No data in the debug buffer.\n"); + + out: + if (databuf) + free(databuf); + if (fd != stdout) + fclose(fd); + return 0; +} + +int jt_dbg_debug_daemon(int argc, char **argv) +{ + int i, rc; + unsigned int cmd = 0; + FILE *fd = stdout; + struct portal_ioctl_data data; + + if (argc <= 1) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + 
/*
 * debug_file command: read a saved debug log from argv[1], filter it with
 * dump_buffer() and print it to stdout or to the file named by argv[2].
 * An optional argv[3] is parsed with atoi() and passed on as the `raw` flag.
 *
 * Returns 0 on success or on a usage error, -1 if the input cannot be
 * opened, stat'ed or mapped, or if the output file cannot be created.
 */
int jt_dbg_debug_file(int argc, char **argv)
{
        int rc = -1, fd = -1, raw = 1;
        FILE *output = stdout;
        void *databuf = MAP_FAILED;
        struct stat statbuf;

        if (argc > 4 || argc < 2) {
                fprintf(stderr, "usage: %s [output] [raw]\n", argv[0]);
                return 0;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
                        strerror(errno));
                return -1;
        }
#warning FIXME: cleanup fstat issue here
#ifndef SYS_fstat64
#define __SYS_fstat__ SYS_fstat
#else
#define __SYS_fstat__ SYS_fstat64
#endif
        if (syscall(__SYS_fstat__, fd, &statbuf) < 0) {
                fprintf(stderr, "fstat failed: %s\n", strerror(errno));
                goto out;
        }

        if (argc >= 3) {
                output = fopen(argv[2], "w");
                if (output == NULL) {
                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
                                strerror(errno));
                        output = stdout;        /* nothing to close below */
                        goto out;
                }
        }

        if (argc == 4)
                raw = atoi(argv[3]);

        /* Private, writable mapping: the file itself is opened read-only. */
        databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE, fd, 0);
        if (databuf == MAP_FAILED) {
                /* mmap() signals failure with MAP_FAILED, never NULL. */
                fprintf(stderr, "mmap failed: %s\n", strerror(errno));
                goto out;
        }

        dump_buffer(output, databuf, statbuf.st_size, raw);
        rc = 0;

 out:
        if (databuf != MAP_FAILED)
                munmap(databuf, statbuf.st_size);
        if (output != stdout)
                fclose(output);
        if (fd >= 0)            /* 0 is a valid descriptor too */
                close(fd);
        return rc;
}
} + return 0; +} + + +int jt_dbg_modules(int argc, char **argv) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct mod_paths { + char *name, *path; + } *mp, mod_paths[] = { + {"portals", "portals/linux/oslib"}, + {"ksocknal", "portals/linux/socknal"}, + {"obdclass", "lustre/obdclass"}, + {"ptlrpc", "lustre/ptlrpc"}, + {"obdext2", "lustre/obdext2"}, + {"ost", "lustre/ost"}, + {"osc", "lustre/osc"}, + {"mds", "lustre/mds"}, + {"mdc", "lustre/mdc"}, + {"llite", "lustre/llite"}, + {"obdecho", "lustre/obdecho"}, + {"ldlm", "lustre/ldlm"}, + {"obdfilter", "lustre/obdfilter"}, + {"extN", "lustre/extN"}, + {"lov", "lustre/lov"}, + {"fsfilt_ext3", "lustre/obdclass"}, + {"fsfilt_extN", "lustre/obdclass"}, + {"mds_ext2", "lustre/mds"}, + {"mds_ext3", "lustre/mds"}, + {"mds_extN", "lustre/mds"}, + {"ptlbd", "lustre/ptlbd"}, + {NULL, NULL} + }; + char *path = ".."; + char *kernel = "linux"; + + if (argc >= 2) + path = argv[1]; + if (argc == 3) + kernel = argv[2]; + if (argc > 3) { + printf("%s [path] [kernel]\n", argv[0]); + return 0; + } + + for (mp = mod_paths; mp->name != NULL; mp++) { + struct module_info info; + int rc; + size_t crap; + int query_module(const char *name, int which, void *buf, + size_t bufsize, size_t *ret); + + rc = query_module(mp->name, QM_INFO, &info, sizeof(info), + &crap); + if (rc < 0) { + if (errno != ENOENT) + printf("query_module(%s) failed: %s\n", + mp->name, strerror(errno)); + } else { + printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, + mp->path, mp->name, + info.addr + sizeof(struct module)); + } + } + + return 0; +#else + printf("jt_dbg_module is not yet implemented for Linux 2.5\n"); + return 0; +#endif /* linux 2.5 */ +} + +int jt_dbg_panic(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + 
return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} diff --git a/lustre/portals/utils/debugctl.c b/lustre/portals/utils/debugctl.c new file mode 100644 index 0000000..02cb9b4 --- /dev/null +++ b/lustre/portals/utils/debugctl.c @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. 
+ * + */ + +#include +#include +#include +#include +#include "parser.h" + + +command_t list[] = { + {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"}, + {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, + {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file [output] [raw], read debug buffer from input and print it [to output]"}, + {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"}, + {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"}, + {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"}, + {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"}, + {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"}, + {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: )"}, + {"panic", jt_dbg_panic, 0, "cause the kernel to panic"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (dbg_initialize(argc, argv) < 0) + exit(2); + + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + + Parser_init("debugctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + unregister_ioc_dev(PORTALS_DEV_ID); + return 0; +} diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c new file mode 100644 index 0000000..722bb57 --- /dev/null +++ b/lustre/portals/utils/l_ioctl.c @@ -0,0 +1,281 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char * dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +struct dump_hdr { + int magic; + int dev_id; + int opc; +}; + +char * dump_filename; + +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +static int +do_ioctl(int dev_id, int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + return rc; + +} + +static FILE * +get_dump_file() +{ + FILE *fp = NULL; + + if (!dump_filename) { + fprintf(stderr, "no dump filename\n"); + } else + fp = fopen(dump_filename, "a"); 
+ return fp; +} + +/* + * The dump file should start with a description of which devices are + * used, but for now it will assumed whatever app reads the file will + * know what to do. */ +int +dump(int dev_id, int opc, void *buf) +{ + FILE *fp; + struct dump_hdr dump_hdr; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + int rc; + + printf("dumping opc %x to %s\n", opc, dump_filename); + + + dump_hdr.magic = 0xdeadbeef; + dump_hdr.dev_id = dev_id; + dump_hdr.opc = opc; + + fp = get_dump_file(); + if (fp == NULL) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char * dev_name) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return; + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +/* If this file is set, then all ioctl buffers will be + appended to the file. */ +int +set_ioctl_dump(char * file) +{ + if (dump_filename) + free(dump_filename); + + dump_filename = strdup(file); + return 0; +} + +int +l_ioctl(int dev_id, int opc, void *buf) +{ + if (dump_filename) + return dump(dev_id, opc, buf); + else + return do_ioctl(dev_id, opc, buf); +} + +/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer + * in the file. 
For example: + * + * parse_dump("lctl.dump", l_ioctl); + * + * Note: if using l_ioctl, then you also need to register_ioc_dev() for + * each device used in the dump. + */ +int +parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) +{ + int fd, line =0; + struct stat st; + char *buf, *end; + + fd = syscall(SYS_open, dump_file, O_RDONLY); + +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + if (syscall(__SYS_fstat__, fd, &st)) { + perror("stat fails"); + exit(1); + } + + if (st.st_size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } + + buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = buf + st.st_size; + close(fd); + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + char tmp[8096]; + int rc; + + line++; + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + if (buf + data->ioc_len > end ) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } +#if 0 + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); +#endif + + memcpy(tmp, data, data->ioc_len); + + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } + + buf += data->ioc_len + sizeof(*dump_hdr); + } + return 0; +} + +int +jt_ioc_dump(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + printf("setting dumpfile to: %s\n", argv[1]); + + set_ioctl_dump(argv[1]); + return 0; +} diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c new file mode 100644 index 0000000..4d93645 --- /dev/null +++ b/lustre/portals/utils/parser.c @@ -0,0 +1,703 
/*
 * Return a pointer to the first character of `s` that is not whitespace
 * (as classified by isspace()); this is the terminating NUL if the string
 * is empty or all whitespace.  The string is not modified.
 */
static char * skipwhitespace(char * s)
{
        /* isspace() needs an unsigned char value; the NUL ends the scan. */
        while (isspace((unsigned char)*s))
                s++;
        return(s);
}
/*
 * Split `line` in place on spaces and tabs and store a pointer to each token
 * in `argv`.
 *
 * At most `maxargs` pointers are written, so `argv` must have room for at
 * least `maxargs` entries; further tokens are ignored.  `line` is modified
 * by strtok().
 *
 * Returns the number of tokens stored (0 for a blank line or when
 * `maxargs` is not positive).
 */
static int line2args(char *line, char **argv, int maxargs)
{
        int count = 0;
        char *tok;

        if (maxargs <= 0)
                return 0;

        /* Strictly less than maxargs: argv[maxargs] is out of bounds. */
        for (tok = strtok(line, " \t");
             tok != NULL && count < maxargs;
             tok = strtok(NULL, " \t"))
                argv[count++] = tok;

        return count;
}
*/ +static int process(char *s, char ** next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if ( ! *result ) + return CMD_NONE; + + /* found entry: is it ambigous, i.e. not exact command name and + more than one command in the list matches. Note that find_cmd + points to the first ambiguous entry */ + if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) && + find_cmd(s, (*result) + 1, next)) + return CMD_AMBIG; + + /* found a unique command: component or full? */ + if ( (*result)->pc_func ) { + return CMD_COMPLETE; + } else { + if ( *next == '\0' ) { + return CMD_INCOMPLETE; + } else { + return process(*next, next, (*result)->pc_sub_cmd, result, prev); + } + } +} + +#ifdef HAVE_LIBREADLINE +static command_t * match_tbl; /* Command completion against this table */ +static char * command_generator(const char * text, int state) +{ + static int index, + len; + char *name; + + /* Do we have a match table? 
*/ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(char * text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; + table = find_cmd(pos, match_tbl, &pos)) { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return(completion_matches(text, command_generator)); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch( process(line, &next, top_level, &cmd, &prev) ) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, cmd, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +int +noop_fn () +{ + return (0); +} + +/* just in case you're ever in an airplane and discover you + forgot to install readline-dev. :) */ +int init_input() +{ + int interactive = isatty (fileno (stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) + { + rl_prep_term_function = (rl_vintfunc_t *)noop_fn; + rl_deprep_term_function = (rl_voidfunc_t *)noop_fn; + } + + rl_attempted_completion_function = (CPPFunction *)command_completion; + rl_completion_entry_function = (void *)command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + char line[2048]; + int n = 0; + if (prompt) + printf ("%s", prompt); + if (fgets(line, sizeof(line), stdin) == NULL) + return (NULL); + n = strlen(line); + if (n && line[n-1] == '\n') + line[n-1] = '\0'; + return strdup(line); +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? 
/*
 * Convert the string `s` to an integer stored in *val.
 *
 * The radix is chosen from the prefix: "0x" selects hexadecimal, any other
 * leading '0' selects octal, everything else is decimal.
 *
 * Returns the sscanf() result: 1 when a number was converted, 0 or EOF
 * otherwise.
 */
int Parser_int(char *s, int *val)
{
        /* No leading zero: plain decimal. */
        if (s[0] != '0')
                return(sscanf(s, "%d", val));

        /* "0x": the hexadecimal digits start after the two-character prefix. */
        if (s[1] == 'x')
                return(sscanf(s + 2, "%x", val));

        /* Any other leading zero: octal. */
        return(sscanf(s, "%o", val));
}
/*
 * Prompt with "<prompt> [<deft>]: " and read one line via readline().
 *
 * Up to `len` bytes of the answer (or of `deft`, when the answer is empty or
 * input has ended) are copied into `res`, which is always NUL-terminated
 * when `len` is non-zero; longer text is truncated.
 *
 * Returns `res` if a line was read (even an empty one), or NULL if
 * readline() returned NULL or the prompt could not be allocated; in the NULL
 * case `res` still holds the default.
 */
char *Parser_getstr(const char *prompt, const char *deft, char *res,
                    size_t len)
{
        char *line = NULL;
        /* "%s [%s]: " adds five characters plus the NUL to the two strings. */
        size_t size = strlen(prompt) + strlen(deft) + 8;
        char *theprompt = malloc(size);

        if (theprompt != NULL) {
                snprintf(theprompt, size, "%s [%s]: ", prompt, deft);
                line = readline(theprompt);
                free(theprompt);
        }

        if ( line == NULL || *line == '\0' ) {
                strncpy(res, deft, len);
        } else {
                strncpy(res, line, len);
        }
        /* strncpy() leaves no terminator when the source fills the buffer. */
        if (len > 0)
                res[len - 1] = '\0';

        if ( line == NULL )
                return NULL;

        free(line);
        return res;
}
assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. 
/*
 * Parse an integer out of `inp`; if `inp` is NULL or is not a valid number
 * in `base`, prompt for one with Parser_getint() using the given bounds and
 * default.
 *
 * Returns the parsed or prompted value.  A value parsed directly from `inp`
 * is returned as-is and is not checked against `min`/`max`.
 */
long Parser_intarg(const char *inp, const char *prompt, int deft,
                   int min, int max, int base)
{
        long result;

        if (inp != NULL && Parser_arg2int(inp, &result, base) == 0)
                return result;

        /* Parser_getint() takes (prompt, min, max, deft, base), which is a
         * different order from this function's own parameter list. */
        return Parser_getint(prompt, min, max, deft, base);
}
(str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h new file mode 100644 index 0000000..dead9f5 --- /dev/null +++ b/lustre/portals/utils/parser.h @@ -0,0 +1,73 @@ +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 100 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + 
+/* Extracts an integer from a string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(int *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c new file mode 100644 index 0000000..8235271 --- /dev/null +++ b/lustre/portals/utils/portals.c @@ -0,0 +1,1005 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "parser.h" + +unsigned int portal_debug; +unsigned int portal_printk; +unsigned int portal_stack; + + +static ptl_nid_t g_nid = 0; +static unsigned int g_nal = 0; +static unsigned short g_port = 0; + +static int g_socket_txmem = 0; +static int g_socket_rxmem = 0; +static int g_socket_nonagle = 1; + +typedef struct +{ + char *name; + int num; +} name2num_t; + +static name2num_t nalnames[] = { + {"tcp", SOCKNAL}, + {"toe", TOENAL}, + {"elan", QSWNAL}, + {"gm", GMNAL}, + {"scimac", SCIMACNAL}, + {NULL, -1} +}; + +static name2num_t * +name2num_lookup_name (name2num_t *table, char *str) +{ + while (table->name != NULL) + if (!strcmp (str, table->name)) + return (table); + else + table++; + return (NULL); +} + +static name2num_t * +name2num_lookup_num (name2num_t *table, int num) +{ + while (table->name != NULL) + if (num == table->num) + return (table); + else + table++; + return (NULL); +} + +int +ptl_name2nal (char *str) +{ + name2num_t *e = name2num_lookup_name (nalnames, str); + + return ((e == NULL) ? 0 : e->num); +} + +static char * +nal2name (int nal) +{ + name2num_t *e = name2num_lookup_num (nalnames, nal); + + return ((e == NULL) ? "???" : e->name); +} + +static int +nid2nal (ptl_nid_t nid) +{ + /* BIG pragmatic assumption */ + return ((((__u32)nid) & 0xffff0000) != 0 ? 
SOCKNAL : QSWNAL); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + struct hostent *he; + int a; + int b; + int c; + int d; + + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) + { + __u32 addr = (a<<24)|(b<<16)|(c<<8)|d; + + *nidp = (ptl_nid_t)addr; + return (0); + } + + if ((('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) && + (he = gethostbyname (str)) != NULL) + { + __u32 addr = *(__u32 *)he->h_addr; + + *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + return (0); + } + + if (sscanf (str, "%i", &a) == 1) + { + *nidp = (ptl_nid_t)a; + return (0); + } + + if (sscanf (str, "%x", &a) == 1) + { + *nidp = (ptl_nid_t) a; + return (0); + } + + return (-1); +} + +char * +ptl_nid2str (char *buffer, ptl_nid_t nid) +{ + switch (nid2nal(nid)) + { + case QSWNAL: + sprintf (buffer, LPD64, nid); + return (buffer); + + case SCIMACNAL: + sprintf (buffer, LPX64, nid); + return (buffer); + + case SOCKNAL: { + __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ + struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); + + if (he != NULL) + strcpy (buffer, he->h_name); + else + { + addr = (__u32)nid; + sprintf (buffer, "%d.%d.%d.%d", + (addr>>24)&0xff, (addr>>16)&0xff, (addr>>8)&0xff, addr&0xff); + } + return (buffer); + } + + default: + sprintf (buffer, "nid2nal broken"); + return (buffer); + } +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return 
(rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int ptl_initialize(int argc, char **argv) +{ + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + return 0; +} + + +int jt_ptl_network(int argc, char **argv) +{ + int nal; + + if (argc != 2 || + (nal = ptl_name2nal (argv[1])) == 0) + { + name2num_t *entry; + + fprintf(stderr, "usage: %s \n", argv[0]); + for (entry = nalnames; entry->name != NULL; entry++) + fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name); + fprintf(stderr, ">\n"); + } + else + g_nal = nal; + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. 
*/ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... */ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +int jt_ptl_connect(int argc, char **argv) +{ + if (argc < 2) { + usage: + fprintf(stderr, "usage: %s or \n", + argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + ptl_nid_t peer_nid; + struct hostent *he; + struct portal_ioctl_data data; + struct sockaddr_in srvaddr; + char *flag; + int fd, rc; + int nonagle = 0; + int rxmem = 0; + int txmem = 0; + int bind_irq = 0; + int xchange_nids = 0; + int o; + int olen; + + if (argc < 3) { + goto usage; + } + + he = gethostbyname(argv[1]); 
+ if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + g_port = atol(argv[2]); + + if (argc > 3) + for (flag = argv[3]; *flag != 0; flag++) + switch (*flag) + { + case 'i': + bind_irq = 1; + break; + + case 'x': + xchange_nids = 1; + break; + + default: + fprintf (stderr, "unrecognised flag '%c'\n", + *flag); + return (-1); + } + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(g_port); + srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", + strerror(errno)); + return -1; + } + + if (g_socket_nonagle) + { + o = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_rxmem != 0) + { + o = g_socket_rxmem; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_txmem != 0) + { + o = g_socket_txmem; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + fprintf(stderr, "connect() failed: %s\n", + strerror(errno)); + return -1; + } + + olen = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) + fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); + olen = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) + fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); + olen = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) + fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); + + if 
(xchange_nids) { + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc != 0) + { + fprintf (stderr, "failed to get my nid: %s\n", + strerror (errno)); + close (fd); + return (-1); + } + + rc = exchange_nids (fd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (fd); + return (-1); + } + } + else + peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */ + + printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], + peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled"); + + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: " + "%s\n", strerror(errno)); + close (fd); + return -1; + } + + g_nid = peer_nid; + printf("Connection to "LPX64" registered with socknal\n", g_nid); + + rc = close(fd); + if (rc) { + fprintf(stderr, "close failed: %d\n", rc); + } + } else if (g_nal == QSWNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == GMNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == SCIMACNAL) { + unsigned int tmpnid; + if(sscanf(argv[1], "%x", &tmpnid) == 1) { + g_nid=tmpnid; + } + else { + fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]); + } + + + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + } + + return 0; +} + +int jt_ptl_disconnect(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Disconnecting ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to remove connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'disconnect' doesn't make any sense for " + "elan.\n"); + } else if (g_nal == GMNAL) { + printf("'disconnect' doesn't make any sense for " + "GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'disconnect' doesn't make any sense for " + "SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_push_connection (int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Pushing ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to push connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'push' doesn't make any sense for elan.\n"); + } else if (g_nal == GMNAL) { + printf("'push' doesn't make any sense for GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'push' doesn't make any sense for SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_ping(int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + long count = 1; + long size = 4; + long timeout = 1; + struct portal_ioctl_data data; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc > 2) + { + count = atol(argv[2]); + + if (count < 0 || count > 20000) + { + fprintf(stderr, "are you insane? %ld is a crazy count.\n", count); + return -1; + } + } + + if (argc > 3) + size= atol(argv[3]); + + if (argc > 4) + timeout = atol (argv[4]); + + PORTAL_IOC_INIT (data); + data.ioc_count = count; + data.ioc_size = size; + data.ioc_nid = nid; + data.ioc_nal = g_nal; + data.ioc_timeout = timeout; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); + if (rc) { + fprintf(stderr, "failed to start pinger: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_ptl_mynid(int argc, char **argv) +{ + int rc; + struct hostent *h; + char buf[1024], *hostname; + struct portal_ioctl_data data; + ptl_nid_t mynid; + + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + fprintf(stderr, "hostname defaults to the hostname of the " + "machine.\n"); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (g_nal == QSWNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for elan.\n"); + return -1; + } else if (g_nal == GMNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for GM.\n"); + return -1; + } else if (g_nal == SCIMACNAL) { + fprintf(stderr, "'mynid' doesn't make any sense for SCI.\n"); + return -1; + } + + if (g_nal != SOCKNAL && g_nal != TOENAL) { + 
fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + return -1; + } + + if (argc == 1) { + if (gethostname(buf, sizeof(buf)) != 0) { + fprintf(stderr, "gethostname failed: %s\n", + strerror(errno)); + return -1; + } + hostname = buf; + } else { + hostname = argv[1]; + } + + h = gethostbyname(hostname); + + if (!h) { + fprintf(stderr, "cannot get address for host '%s': %d\n", + hostname, h_errno); + return -1; + } + mynid = (ptl_nid_t)ntohl (*(__u32 *)h->h_addr); /* HOST byte order */ + + PORTAL_IOC_INIT(data); + data.ioc_nid = mynid; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc < 0) + fprintf(stderr, "IOC_PORTAL_REGISTER_MYNID failed: %s\n", + strerror(errno)); + else + printf("registered my nid "LPX64" (%s)\n", mynid, hostname); + return 0; +} + +int +jt_ptl_fail_nid (int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + unsigned int threshold; + struct portal_ioctl_data data; + + if (argc < 2 || argc > 3) + { + fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (!strcmp (argv[1], "_all_")) + nid = PTL_NID_ANY; + else if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc < 3) + threshold = PTL_MD_THRESH_INF; + else if (sscanf (argv[2], "%i", &threshold) != 1) { + fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); + return (-1); + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_count = threshold; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); + if (rc < 0) + fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", + strerror (errno)); + else + printf ("%s %s\n", threshold == 0 ? 
"Unfailing" : "Failing", argv[1]); + + return (0); +} + +int +jt_ptl_rxmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + + g_socket_rxmem = size; + } + printf ("Socket rmem = %d\n", g_socket_rxmem); + return (0); +} + +int +jt_ptl_txmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + g_socket_txmem = size; + } + printf ("Socket txmem = %d\n", g_socket_txmem); + return (0); +} + +int +jt_ptl_nagle (int argc, char **argv) +{ + int enable; + + if (argc > 1) + { + if (Parser_bool (&enable, argv[1]) != 0) + { + fprintf (stderr, "Can't parse boolean %s\n", argv[1]); + return (0); + } + g_socket_nonagle = !enable; + } + printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled"); + return (0); +} + +int +jt_ptl_add_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid1; + ptl_nid_t nid2; + ptl_nid_t gateway_nid; + int gateway_nal; + int rc; + + if (argc < 3) + { + fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); + return (-1); + } + + gateway_nal = nid2nal (gateway_nid); + + if (ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) + nid2 = nid1; + else if (ptl_parse_nid (&nid2, argv[3]) != 0) + { + fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[4]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = gateway_nid; + data.ioc_nal = gateway_nal; + data.ioc_nid2 = MIN (nid1, nid2); + data.ioc_nid3 = MAX (nid1, nid2); + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data); + if 
(rc != 0) + { + fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_del_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid; + int rc; + + if (argc < 2) + { + fprintf (stderr, "usage: %s targetNID\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_print_routes (int argc, char **argv) +{ + char buffer[3][128]; + struct portal_ioctl_data data; + int rc; + int index; + int gateway_nal; + ptl_nid_t gateway_nid; + ptl_nid_t nid1; + ptl_nid_t nid2; + + + for (index = 0;;index++) + { + PORTAL_IOC_INIT(data); + data.ioc_count = index; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data); + if (rc != 0) + break; + + gateway_nal = data.ioc_nal; + gateway_nid = data.ioc_nid; + nid1 = data.ioc_nid2; + nid2 = data.ioc_nid3; + + printf ("%8s %18s : %s - %s\n", + nal2name (gateway_nal), + ptl_nid2str (buffer[0], gateway_nid), + ptl_nid2str (buffer[1], nid1), + ptl_nid2str (buffer[2], nid2)); + } + return (0); +} + diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c new file mode 100644 index 0000000..d38bd4a --- /dev/null +++ b/lustre/portals/utils/ptlctl.c @@ -0,0 +1,64 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include + +#include "parser.h" + + +command_t list[] = { + {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: | for tcp/elan respectively)"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname]"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname]"}, + {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, + {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"}, + {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, + {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, + {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, + {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, + 
{"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (ptl_initialize(argc, argv) < 0) + exit(1); + + Parser_init("ptlctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + return 0; +} diff --git a/lustre/portals/utils/routerstat.c b/lustre/portals/utils/routerstat.c new file mode 100644 index 0000000..37da12c --- /dev/null +++ b/lustre/portals/utils/routerstat.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +double +timenow () +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000.0); +} + +void +do_stat (int fd) +{ + static char buffer[1024]; + static double last = 0.0; + double now; + double t; + long long bytes; + long packets; + long errors; + long depth; + int n; + + lseek (fd, 0, SEEK_SET); + now = timenow(); + n = read (fd, buffer, sizeof (buffer)); + if (n < 0) + { + fprintf (stderr, "Can't read statfile\n"); + exit (1); + } + buffer[n] = 0; + + n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth); + + if (n < 3) + { + fprintf (stderr, "Can't parse statfile\n"); + exit (1); + } + + if (last == 0.0) + printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", + bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors); + else + { + t = now - last; + + printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", + bytes, ((double)bytes)/((1<<20) * t), + packets, (long long)((packets == 0) ? 
0LL : bytes/packets), (long)(packets/t), + errors, (long)(errors/t)); + } + + if (n == 4) + printf (" (%ld)\n", depth); + else + printf ("\n"); + + fflush (stdout); + + lseek (fd, 0, SEEK_SET); + write (fd, "\n", 1); + last = timenow(); +} + +int main (int argc, char **argv) +{ + int interval = 0; + int fd; + + if (argc > 1) + interval = atoi (argv[1]); + + fd = open ("/proc/sys/portals/router", O_RDWR); + if (fd < 0) + { + fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); + return (1); + } + + do_stat (fd); + if (interval == 0) + return (0); + + for (;;) + { + sleep (interval); + do_stat (fd); + } +}