Whamcloud - gitweb
- add portals to Lustre
authorbraam <braam>
Mon, 19 May 2003 04:27:44 +0000 (04:27 +0000)
committerbraam <braam>
Mon, 19 May 2003 04:27:44 +0000 (04:27 +0000)
- fix up makefiles
- add initial part of 2.5 in kernel make infrastructure
- cleanup tcpnal prototypes etc.

331 files changed:
lnet/AUTHORS [new file with mode: 0644]
lnet/ChangeLog [new file with mode: 0644]
lnet/Kernelenv.in [new file with mode: 0644]
lnet/Kernelenv.mk [new file with mode: 0644]
lnet/Makefile.am [new file with mode: 0644]
lnet/Makefile.mk [new file with mode: 0644]
lnet/NEWS [new file with mode: 0644]
lnet/README [new file with mode: 0644]
lnet/Rules.linux.in [new file with mode: 0644]
lnet/archdep.m4 [new file with mode: 0644]
lnet/autogen.sh [new file with mode: 0644]
lnet/build.m4 [new file with mode: 0644]
lnet/configure.in [new file with mode: 0644]
lnet/doc/.cvsignore [new file with mode: 0644]
lnet/doc/Data-structures [new file with mode: 0644]
lnet/doc/Makefile.am [new file with mode: 0644]
lnet/doc/Message-life-cycle [new file with mode: 0644]
lnet/doc/NAL-HOWTO [new file with mode: 0644]
lnet/doc/file.fig [new file with mode: 0644]
lnet/doc/flow_new.fig [new file with mode: 0644]
lnet/doc/get.fig [new file with mode: 0644]
lnet/doc/ieee.bst [new file with mode: 0644]
lnet/doc/mpi.fig [new file with mode: 0644]
lnet/doc/portals.fig [new file with mode: 0644]
lnet/doc/portals3.bib [new file with mode: 0644]
lnet/doc/portals3.lyx [new file with mode: 0644]
lnet/doc/put.fig [new file with mode: 0644]
lnet/include/Makefile.am [new file with mode: 0644]
lnet/include/config.h.in [new file with mode: 0644]
lnet/include/linux/Makefile.am [new file with mode: 0644]
lnet/include/linux/kp30.h [new file with mode: 0644]
lnet/include/linux/portals_lib.h [new file with mode: 0644]
lnet/include/lnet/Makefile.am [new file with mode: 0644]
lnet/include/lnet/api-support.h [new file with mode: 0644]
lnet/include/lnet/api.h [new file with mode: 0644]
lnet/include/lnet/arg-blocks.h [new file with mode: 0644]
lnet/include/lnet/defines.h [new file with mode: 0644]
lnet/include/lnet/errno.h [new file with mode: 0644]
lnet/include/lnet/internal.h [new file with mode: 0644]
lnet/include/lnet/lib-dispatch.h [new file with mode: 0644]
lnet/include/lnet/lib-lnet.h [new file with mode: 0644]
lnet/include/lnet/lib-nal.h [new file with mode: 0644]
lnet/include/lnet/lib-p30.h [new file with mode: 0644]
lnet/include/lnet/lib-types.h [new file with mode: 0644]
lnet/include/lnet/list.h [new file with mode: 0644]
lnet/include/lnet/lltrace.h [new file with mode: 0644]
lnet/include/lnet/lnet.h [new file with mode: 0644]
lnet/include/lnet/lnetctl.h [new file with mode: 0644]
lnet/include/lnet/myrnal.h [new file with mode: 0644]
lnet/include/lnet/nal.h [new file with mode: 0644]
lnet/include/lnet/nalids.h [new file with mode: 0644]
lnet/include/lnet/p30.h [new file with mode: 0644]
lnet/include/lnet/ppid.h [new file with mode: 0644]
lnet/include/lnet/ptlctl.h [new file with mode: 0644]
lnet/include/lnet/stringtab.h [new file with mode: 0644]
lnet/include/lnet/types.h [new file with mode: 0644]
lnet/klnds/Makefile.am [new file with mode: 0644]
lnet/klnds/Makefile.mk [new file with mode: 0644]
lnet/klnds/gmlnd/Makefile.am [new file with mode: 0644]
lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch [new file with mode: 0644]
lnet/klnds/gmlnd/gmlnd.h [new file with mode: 0644]
lnet/klnds/gmlnd/gmlnd_cb.c [new file with mode: 0644]
lnet/klnds/gmlnd/gmnal.c [new file with mode: 0644]
lnet/klnds/qswlnd/Makefile.am [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd.c [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd.h [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd_cb.c [new file with mode: 0644]
lnet/klnds/scimaclnd/Makefile.am [new file with mode: 0644]
lnet/klnds/scimaclnd/README.scimacnal [new file with mode: 0644]
lnet/klnds/scimaclnd/scimac.conf [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal.c [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal.h [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal_cb.c [new file with mode: 0644]
lnet/klnds/socklnd/Makefile.am [new file with mode: 0644]
lnet/klnds/socklnd/Makefile.mk [new file with mode: 0644]
lnet/klnds/socklnd/socklnd.c [new file with mode: 0644]
lnet/klnds/socklnd/socklnd.h [new file with mode: 0644]
lnet/klnds/socklnd/socklnd_cb.c [new file with mode: 0644]
lnet/klnds/toelnd/Makefile.am [new file with mode: 0644]
lnet/klnds/toelnd/toenal.c [new file with mode: 0644]
lnet/klnds/toelnd/toenal.h [new file with mode: 0644]
lnet/klnds/toelnd/toenal_cb.c [new file with mode: 0644]
lnet/libcfs/Makefile.am [new file with mode: 0644]
lnet/libcfs/Makefile.mk [new file with mode: 0644]
lnet/libcfs/debug.c [new file with mode: 0644]
lnet/libcfs/module.c [new file with mode: 0644]
lnet/libcfs/proc.c [new file with mode: 0644]
lnet/lnet/Makefile.am [new file with mode: 0644]
lnet/lnet/Makefile.mk [new file with mode: 0644]
lnet/lnet/api-eq.c [new file with mode: 0644]
lnet/lnet/api-errno.c [new file with mode: 0644]
lnet/lnet/api-init.c [new file with mode: 0644]
lnet/lnet/api-md.c [new file with mode: 0644]
lnet/lnet/api-me.c [new file with mode: 0644]
lnet/lnet/api-ni.c [new file with mode: 0644]
lnet/lnet/api-wrap.c [new file with mode: 0644]
lnet/lnet/lib-dispatch.c [new file with mode: 0644]
lnet/lnet/lib-eq.c [new file with mode: 0644]
lnet/lnet/lib-init.c [new file with mode: 0644]
lnet/lnet/lib-md.c [new file with mode: 0644]
lnet/lnet/lib-me.c [new file with mode: 0644]
lnet/lnet/lib-move.c [new file with mode: 0644]
lnet/lnet/lib-msg.c [new file with mode: 0644]
lnet/lnet/lib-ni.c [new file with mode: 0644]
lnet/lnet/lib-not-impl.c [new file with mode: 0644]
lnet/lnet/lib-pid.c [new file with mode: 0644]
lnet/packaging/.cvsignore [new file with mode: 0644]
lnet/packaging/Makefile.am [new file with mode: 0644]
lnet/packaging/portals.spec.in [new file with mode: 0644]
lnet/router/Makefile.am [new file with mode: 0644]
lnet/router/Makefile.mk [new file with mode: 0644]
lnet/router/proc.c [new file with mode: 0644]
lnet/router/router.c [new file with mode: 0644]
lnet/router/router.h [new file with mode: 0644]
lnet/tests/.cvsignore [new file with mode: 0644]
lnet/tests/Makefile.am [new file with mode: 0644]
lnet/tests/ping.h [new file with mode: 0644]
lnet/tests/ping_cli.c [new file with mode: 0644]
lnet/tests/ping_srv.c [new file with mode: 0644]
lnet/tests/sping_cli.c [new file with mode: 0644]
lnet/tests/sping_srv.c [new file with mode: 0644]
lnet/tests/startclient.sh [new file with mode: 0644]
lnet/tests/startserver.sh [new file with mode: 0644]
lnet/tests/stopclient.sh [new file with mode: 0644]
lnet/tests/stopserver.sh [new file with mode: 0644]
lnet/ulnds/Makefile.am [new file with mode: 0644]
lnet/ulnds/README [new file with mode: 0644]
lnet/ulnds/address.c [new file with mode: 0644]
lnet/ulnds/bridge.h [new file with mode: 0644]
lnet/ulnds/connection.c [new file with mode: 0644]
lnet/ulnds/connection.h [new file with mode: 0644]
lnet/ulnds/debug.c [new file with mode: 0644]
lnet/ulnds/dispatch.h [new file with mode: 0644]
lnet/ulnds/ipmap.h [new file with mode: 0644]
lnet/ulnds/pqtimer.c [new file with mode: 0644]
lnet/ulnds/pqtimer.h [new file with mode: 0644]
lnet/ulnds/procapi.c [new file with mode: 0644]
lnet/ulnds/procbridge.h [new file with mode: 0644]
lnet/ulnds/proclib.c [new file with mode: 0644]
lnet/ulnds/select.c [new file with mode: 0644]
lnet/ulnds/socklnd/Makefile.am [new file with mode: 0644]
lnet/ulnds/socklnd/README [new file with mode: 0644]
lnet/ulnds/socklnd/address.c [new file with mode: 0644]
lnet/ulnds/socklnd/bridge.h [new file with mode: 0644]
lnet/ulnds/socklnd/connection.c [new file with mode: 0644]
lnet/ulnds/socklnd/connection.h [new file with mode: 0644]
lnet/ulnds/socklnd/debug.c [new file with mode: 0644]
lnet/ulnds/socklnd/dispatch.h [new file with mode: 0644]
lnet/ulnds/socklnd/ipmap.h [new file with mode: 0644]
lnet/ulnds/socklnd/pqtimer.c [new file with mode: 0644]
lnet/ulnds/socklnd/pqtimer.h [new file with mode: 0644]
lnet/ulnds/socklnd/procapi.c [new file with mode: 0644]
lnet/ulnds/socklnd/procbridge.h [new file with mode: 0644]
lnet/ulnds/socklnd/proclib.c [new file with mode: 0644]
lnet/ulnds/socklnd/select.c [new file with mode: 0644]
lnet/ulnds/socklnd/table.c [new file with mode: 0644]
lnet/ulnds/socklnd/table.h [new file with mode: 0644]
lnet/ulnds/socklnd/tcplnd.c [new file with mode: 0644]
lnet/ulnds/socklnd/timer.h [new file with mode: 0644]
lnet/ulnds/socklnd/utypes.h [new file with mode: 0644]
lnet/ulnds/table.c [new file with mode: 0644]
lnet/ulnds/table.h [new file with mode: 0644]
lnet/ulnds/tcplnd.c [new file with mode: 0644]
lnet/ulnds/timer.h [new file with mode: 0644]
lnet/ulnds/utypes.h [new file with mode: 0644]
lnet/utils/.cvsignore [new file with mode: 0644]
lnet/utils/Makefile.am [new file with mode: 0644]
lnet/utils/acceptor.c [new file with mode: 0644]
lnet/utils/debug.c [new file with mode: 0644]
lnet/utils/debugctl.c [new file with mode: 0644]
lnet/utils/l_ioctl.c [new file with mode: 0644]
lnet/utils/parser.c [new file with mode: 0644]
lnet/utils/parser.h [new file with mode: 0644]
lnet/utils/portals.c [new file with mode: 0644]
lnet/utils/ptlctl.c [new file with mode: 0644]
lnet/utils/routerstat.c [new file with mode: 0644]
lustre/Makefile.mk [new file with mode: 0644]
lustre/mds/Makefile.mk [new file with mode: 0644]
lustre/portals/AUTHORS [new file with mode: 0644]
lustre/portals/ChangeLog [new file with mode: 0644]
lustre/portals/Kernelenv.in [new file with mode: 0644]
lustre/portals/Kernelenv.mk [new file with mode: 0644]
lustre/portals/Makefile.am [new file with mode: 0644]
lustre/portals/Makefile.mk [new file with mode: 0644]
lustre/portals/NEWS [new file with mode: 0644]
lustre/portals/README [new file with mode: 0644]
lustre/portals/Rules.linux.in [new file with mode: 0644]
lustre/portals/archdep.m4 [new file with mode: 0644]
lustre/portals/autogen.sh [new file with mode: 0755]
lustre/portals/build.m4 [new file with mode: 0644]
lustre/portals/configure.in [new file with mode: 0644]
lustre/portals/doc/.cvsignore [new file with mode: 0644]
lustre/portals/doc/Data-structures [new file with mode: 0644]
lustre/portals/doc/Makefile.am [new file with mode: 0644]
lustre/portals/doc/Message-life-cycle [new file with mode: 0644]
lustre/portals/doc/NAL-HOWTO [new file with mode: 0644]
lustre/portals/doc/file.fig [new file with mode: 0644]
lustre/portals/doc/flow_new.fig [new file with mode: 0644]
lustre/portals/doc/get.fig [new file with mode: 0644]
lustre/portals/doc/ieee.bst [new file with mode: 0644]
lustre/portals/doc/mpi.fig [new file with mode: 0644]
lustre/portals/doc/portals.fig [new file with mode: 0644]
lustre/portals/doc/portals3.bib [new file with mode: 0644]
lustre/portals/doc/portals3.lyx [new file with mode: 0644]
lustre/portals/doc/put.fig [new file with mode: 0644]
lustre/portals/include/Makefile.am [new file with mode: 0644]
lustre/portals/include/config.h.in [new file with mode: 0644]
lustre/portals/include/linux/Makefile.am [new file with mode: 0644]
lustre/portals/include/linux/kp30.h [new file with mode: 0644]
lustre/portals/include/linux/portals_lib.h [new file with mode: 0644]
lustre/portals/include/portals/Makefile.am [new file with mode: 0644]
lustre/portals/include/portals/api-support.h [new file with mode: 0644]
lustre/portals/include/portals/api.h [new file with mode: 0644]
lustre/portals/include/portals/arg-blocks.h [new file with mode: 0644]
lustre/portals/include/portals/defines.h [new file with mode: 0644]
lustre/portals/include/portals/errno.h [new file with mode: 0644]
lustre/portals/include/portals/internal.h [new file with mode: 0644]
lustre/portals/include/portals/lib-dispatch.h [new file with mode: 0644]
lustre/portals/include/portals/lib-nal.h [new file with mode: 0644]
lustre/portals/include/portals/lib-p30.h [new file with mode: 0644]
lustre/portals/include/portals/lib-types.h [new file with mode: 0644]
lustre/portals/include/portals/list.h [new file with mode: 0644]
lustre/portals/include/portals/lltrace.h [new file with mode: 0644]
lustre/portals/include/portals/myrnal.h [new file with mode: 0644]
lustre/portals/include/portals/nal.h [new file with mode: 0644]
lustre/portals/include/portals/nalids.h [new file with mode: 0644]
lustre/portals/include/portals/p30.h [new file with mode: 0644]
lustre/portals/include/portals/ppid.h [new file with mode: 0644]
lustre/portals/include/portals/ptlctl.h [new file with mode: 0644]
lustre/portals/include/portals/stringtab.h [new file with mode: 0644]
lustre/portals/include/portals/types.h [new file with mode: 0644]
lustre/portals/knals/Makefile.am [new file with mode: 0644]
lustre/portals/knals/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/gmnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal.c [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal.h [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal_cb.c [new file with mode: 0644]
lustre/portals/knals/qswnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal.c [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal.h [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal_cb.c [new file with mode: 0644]
lustre/portals/knals/scimacnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/scimacnal/README.scimacnal [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimac.conf [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal.c [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal.h [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal_cb.c [new file with mode: 0644]
lustre/portals/knals/socknal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/socknal/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/socknal/socknal.c [new file with mode: 0644]
lustre/portals/knals/socknal/socknal.h [new file with mode: 0644]
lustre/portals/knals/socknal/socknal_cb.c [new file with mode: 0644]
lustre/portals/knals/toenal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/toenal/toenal.c [new file with mode: 0644]
lustre/portals/knals/toenal/toenal.h [new file with mode: 0644]
lustre/portals/knals/toenal/toenal_cb.c [new file with mode: 0644]
lustre/portals/libcfs/Makefile.am [new file with mode: 0644]
lustre/portals/libcfs/Makefile.mk [new file with mode: 0644]
lustre/portals/libcfs/debug.c [new file with mode: 0644]
lustre/portals/libcfs/module.c [new file with mode: 0644]
lustre/portals/libcfs/proc.c [new file with mode: 0644]
lustre/portals/packaging/.cvsignore [new file with mode: 0644]
lustre/portals/packaging/Makefile.am [new file with mode: 0644]
lustre/portals/packaging/portals.spec.in [new file with mode: 0644]
lustre/portals/portals/Makefile.am [new file with mode: 0644]
lustre/portals/portals/Makefile.mk [new file with mode: 0644]
lustre/portals/portals/api-eq.c [new file with mode: 0644]
lustre/portals/portals/api-errno.c [new file with mode: 0644]
lustre/portals/portals/api-init.c [new file with mode: 0644]
lustre/portals/portals/api-md.c [new file with mode: 0644]
lustre/portals/portals/api-me.c [new file with mode: 0644]
lustre/portals/portals/api-ni.c [new file with mode: 0644]
lustre/portals/portals/api-wrap.c [new file with mode: 0644]
lustre/portals/portals/lib-dispatch.c [new file with mode: 0644]
lustre/portals/portals/lib-eq.c [new file with mode: 0644]
lustre/portals/portals/lib-init.c [new file with mode: 0644]
lustre/portals/portals/lib-md.c [new file with mode: 0644]
lustre/portals/portals/lib-me.c [new file with mode: 0644]
lustre/portals/portals/lib-move.c [new file with mode: 0644]
lustre/portals/portals/lib-msg.c [new file with mode: 0644]
lustre/portals/portals/lib-ni.c [new file with mode: 0644]
lustre/portals/portals/lib-not-impl.c [new file with mode: 0644]
lustre/portals/portals/lib-pid.c [new file with mode: 0644]
lustre/portals/router/Makefile.am [new file with mode: 0644]
lustre/portals/router/Makefile.mk [new file with mode: 0644]
lustre/portals/router/proc.c [new file with mode: 0644]
lustre/portals/router/router.c [new file with mode: 0644]
lustre/portals/router/router.h [new file with mode: 0644]
lustre/portals/tests/.cvsignore [new file with mode: 0644]
lustre/portals/tests/Makefile.am [new file with mode: 0644]
lustre/portals/tests/ping.h [new file with mode: 0644]
lustre/portals/tests/ping_cli.c [new file with mode: 0644]
lustre/portals/tests/ping_srv.c [new file with mode: 0644]
lustre/portals/tests/sping_cli.c [new file with mode: 0644]
lustre/portals/tests/sping_srv.c [new file with mode: 0644]
lustre/portals/tests/startclient.sh [new file with mode: 0755]
lustre/portals/tests/startserver.sh [new file with mode: 0755]
lustre/portals/tests/stopclient.sh [new file with mode: 0755]
lustre/portals/tests/stopserver.sh [new file with mode: 0644]
lustre/portals/unals/Makefile.am [new file with mode: 0644]
lustre/portals/unals/README [new file with mode: 0644]
lustre/portals/unals/address.c [new file with mode: 0644]
lustre/portals/unals/bridge.h [new file with mode: 0644]
lustre/portals/unals/connection.c [new file with mode: 0644]
lustre/portals/unals/connection.h [new file with mode: 0644]
lustre/portals/unals/debug.c [new file with mode: 0644]
lustre/portals/unals/dispatch.h [new file with mode: 0644]
lustre/portals/unals/ipmap.h [new file with mode: 0644]
lustre/portals/unals/pqtimer.c [new file with mode: 0644]
lustre/portals/unals/pqtimer.h [new file with mode: 0644]
lustre/portals/unals/procapi.c [new file with mode: 0644]
lustre/portals/unals/procbridge.h [new file with mode: 0644]
lustre/portals/unals/proclib.c [new file with mode: 0644]
lustre/portals/unals/select.c [new file with mode: 0644]
lustre/portals/unals/table.c [new file with mode: 0644]
lustre/portals/unals/table.h [new file with mode: 0644]
lustre/portals/unals/tcpnal.c [new file with mode: 0644]
lustre/portals/unals/timer.h [new file with mode: 0644]
lustre/portals/unals/utypes.h [new file with mode: 0644]
lustre/portals/utils/.cvsignore [new file with mode: 0644]
lustre/portals/utils/Makefile.am [new file with mode: 0644]
lustre/portals/utils/acceptor.c [new file with mode: 0644]
lustre/portals/utils/debug.c [new file with mode: 0644]
lustre/portals/utils/debugctl.c [new file with mode: 0644]
lustre/portals/utils/l_ioctl.c [new file with mode: 0644]
lustre/portals/utils/parser.c [new file with mode: 0644]
lustre/portals/utils/parser.h [new file with mode: 0644]
lustre/portals/utils/portals.c [new file with mode: 0644]
lustre/portals/utils/ptlctl.c [new file with mode: 0644]
lustre/portals/utils/routerstat.c [new file with mode: 0644]

diff --git a/lnet/AUTHORS b/lnet/AUTHORS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/ChangeLog b/lnet/ChangeLog
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/Kernelenv.in b/lnet/Kernelenv.in
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lnet/Kernelenv.mk b/lnet/Kernelenv.mk
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lnet/Makefile.am b/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..3c42103
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = Rules.linux archdep.m4 MCP
+DIST_SUBDIRS = libcfs portals knals unals utils tests doc router
+SUBDIRS = libcfs portals knals unals utils tests doc router
diff --git a/lnet/Makefile.mk b/lnet/Makefile.mk
new file mode 100644 (file)
index 0000000..be0e51a
--- /dev/null
@@ -0,0 +1,6 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += libcfs/
+obj-y += knals/
+obj-y += router/
diff --git a/lnet/NEWS b/lnet/NEWS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/README b/lnet/README
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/Rules.linux.in b/lnet/Rules.linux.in
new file mode 100644 (file)
index 0000000..8247deb
--- /dev/null
@@ -0,0 +1,37 @@
+# included in Linux kernel directories
+# Rules for module building
+
+MODLINK=@MOD_LINK@
+if LINUX25
+
+
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g')
+AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2  -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename)
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+
+
+else
+
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+
+
+endif
+
+
+tags:
+       rm -f $(top_srcdir)/TAGS
+       rm -f $(top_srcdir)/tags
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a
+
+
+
+
diff --git a/lnet/archdep.m4 b/lnet/archdep.m4
new file mode 100644 (file)
index 0000000..0315644
--- /dev/null
@@ -0,0 +1,206 @@
+
+# -------- in kernel compilation? (2.5 only) -------------
+AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles])
+AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
+echo "Makefile for in kernel build: $INKERNEL"
+
+# -------- liblustre compilation --------------
+AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
+
+# -------- set linuxdir ------------
+
+AC_ARG_WITH(linux, [  --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux)
+AC_SUBST(LINUX)
+
+# --------- UML?  --------------------
+AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
+if test $host_cpu = "lib" ; then 
+        host_cpu="lib"
+       AC_MSG_RESULT(no building Lustre library)
+else
+  if test -e $LINUX/include/asm-um ; then
+    if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
+       host_cpu="um";
+       AC_MSG_RESULT(yes)
+    else
+       AC_MSG_RESULT(no (asm doesn't point at asm-um))
+    fi
+
+  else 
+        AC_MSG_RESULT(no (asm-um missing))
+  fi
+fi
+
+# --------- Linux 25 ------------------
+
+AC_MSG_CHECKING(if you are running linux 2.5)
+if test -e $LINUX/include/linux/namei.h ; then
+        linux25="yes"
+        AC_MSG_RESULT(yes)
+else
+        linux25="no"
+        AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
+echo "Makefiles for in linux 2.5 build: $LINUX25"
+
+# -------  Makeflags ------------------
+
+AC_MSG_CHECKING(setting make flags system architecture: )
+case ${host_cpu} in
+       lib )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall '
+       KCPPFLAGS='-D__arch_lib__ '
+        MOD_LINK=elf_i386
+;;
+       um )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include '
+        ;;
+        esac
+
+        MOD_LINK=elf_i386
+;;
+       i*86 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        ;;
+        esac
+        MOD_LINK=elf_i386
+;;
+
+       alphaev6 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alphaev67 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alpha* )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       ia64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step'
+       KCPPFLAGS='-D__KERNEL__ -DMODULE'
+        MOD_LINK=elf64_ia64
+;;
+
+       sparc64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf64_sparc
+
+;;
+
+       powerpc )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf32ppclinux
+;;
+
+        *)
+       AC_ERROR("Unknown Linux Platform: $host_cpu")
+;;
+esac
+
+# ----------- make dep run? ------------------
+
+if test $host_cpu != "lib" ; then 
+  AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
+  if test -f $LINUX/include/linux/config.h ; then
+  AC_MSG_RESULT(yes)
+ else
+  AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
+  fi
+fi
+
+# ------------ include paths ------------------
+
+if test $host_cpu != "lib" ; then 
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include -I$(LINUX)/include'
+else
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include'
+fi
+CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
+
+if test $host_cpu != "lib" ; then 
+# ------------ autoconf.h ------------------
+  AC_MSG_CHECKING(if autoconf.h is in kernel source)
+  if test -f $LINUX/include/linux/autoconf.h ; then
+      AC_MSG_RESULT(yes)
+  else
+      AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
+  fi
+
+# ------------ RELEASE and moduledir ------------------
+  AC_MSG_CHECKING(for Linux release)
+  
+  dnl We need to rid ourselves of the nasty [ ] quotes.
+  changequote(, )
+  dnl Get release from version.h
+  RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`"
+  changequote([, ])
+  
+  moduledir='$(libdir)/modules/'$RELEASE/kernel
+  AC_SUBST(moduledir)
+  
+  modulefsdir='$(moduledir)/fs/$(PACKAGE)'
+  AC_SUBST(modulefsdir)
+  
+  AC_MSG_RESULT($RELEASE)
+  AC_SUBST(RELEASE)
+
+# ---------- modversions? --------------------
+  AC_MSG_CHECKING(for MODVERSIONS)
+  if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
+  then
+        MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB"
+        AC_MSG_RESULT(yes)
+  else
+        MFLAGS=
+        AC_MSG_RESULT(no)
+  fi
+fi
+
+# ---------- SMP -------------------
+#AC_MSG_CHECKING(for SMP)
+#if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then
+#        SMPFLAG=
+#        AC_MSG_RESULT(yes)
+#else
+#        SMPFLAG=
+#        AC_MSG_RESULT(no)
+#fi
+
+CFLAGS="$KCFLAGS"
+CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS "
+
+AC_SUBST(MOD_LINK)
+AC_SUBST(LINUX25)
\ No newline at end of file
diff --git a/lnet/autogen.sh b/lnet/autogen.sh
new file mode 100644 (file)
index 0000000..9deed73
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+aclocal &&
+automake --add-missing &&
+${AUTOCONF:-autoconf}
diff --git a/lnet/build.m4 b/lnet/build.m4
new file mode 100644 (file)
index 0000000..4e8dbbb
--- /dev/null
@@ -0,0 +1,108 @@
+
+# ----------  directories ---------
+
+
+# ---------  unsigned long long sane? -------
+
+AC_CHECK_SIZEOF(unsigned long long, 0)
+echo "---> size SIZEOF $SIZEOF_unsigned_long_long"
+echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long"
+if test $ac_cv_sizeof_unsigned_long_long != 8 ; then
+        AC_MSG_ERROR([** we assume that sizeof(long long) == 8.  Tell phil@clusterfs.com])
+fi
+
+# directories for binaries
+ac_default_prefix=
+bindir='${exec_prefix}/usr/bin'
+sbindir='${exec_prefix}/usr/sbin'
+includedir='${prefix}/usr/include'
+
+# Directories for documentation and demos.
+docdir='${prefix}/usr/share/doc/$(PACKAGE)'
+AC_SUBST(docdir)
+demodir='$(docdir)/demo'
+AC_SUBST(demodir)
+pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples'
+AC_SUBST(pkgexampledir)
+pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre'
+AC_SUBST(pymoddir)
+modulenetdir='$(moduledir)/net/$(PACKAGE)'
+AC_SUBST(modulenetdir)
+
+
+# ----------  BAD gcc? ------------
+AC_PROG_RANLIB
+AC_PROG_CC
+AC_MSG_CHECKING(for buggy compiler)
+CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"`
+bad_cc() {
+       echo
+       echo "   '$CC_VERSION'"
+       echo "  has been known to generate bad code, "
+       echo "  please get an updated compiler."
+       AC_MSG_ERROR(sorry)
+}
+TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
+if test "$TMP_VERSION" = "gcc version 2.95"; then
+        bad_cc
+fi
+case "$CC_VERSION" in 
+       # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
+       # without "sub    $0xc,%esp" to protect the stack from being
+       # stomped on by interrupts (bug 606)
+       "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)")
+               bad_cc
+               ;;
+       # mandrake's similar sub 0xc compiler bug
+       # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2
+       "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
+               bad_cc
+               ;;
+       *)
+               AC_MSG_RESULT(no known problems)
+               ;;
+esac
+# end ------  BAD gcc? ------------
+
+# --------  Check for required packages  --------------
+
+# this doesn't seem to work on older autoconf
+# AC_CHECK_LIB(readline, readline,,)
+AC_ARG_ENABLE(readline,        [  --enable-readline  use readline library],,
+                       enable_readline="yes")
+if test "$enable_readline" = "yes" ; then
+   LIBREADLINE="-lreadline -lncurses"
+   HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1"
+else 
+   LIBREADLINE=""
+   HAVE_LIBREADLINE=""
+fi
+AC_SUBST(LIBREADLINE)
+AC_SUBST(HAVE_LIBREADLINE)
+
+AC_ARG_ENABLE(efence,  [  --enable-efence  use efence library],,
+                       enable_efence="no")
+if test "$enable_efence" = "yes" ; then
+   LIBEFENCE="-lefence"
+   HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1"
+else 
+   LIBEFENCE=""
+   HAVE_LIBEFENCE=""
+fi
+AC_SUBST(LIBEFENCE)
+AC_SUBST(HAVE_LIBEFENCE)
+
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
+AC_MSG_CHECKING(if you are building lib lustre)
+if test "$host_cpu" = "lib"; then
+   AC_MSG_RESULT(yes)
+   libdir='${exec_prefix}/lib/lustre'
+else
+   AC_MSG_RESULT(no)
+fi
+
+# end -------- Kernel build environment. -----------------
+
+
diff --git a/lnet/configure.in b/lnet/configure.in
new file mode 100644 (file)
index 0000000..7c32246
--- /dev/null
@@ -0,0 +1,38 @@
+# This version is here to make autoconf happy; the name is a file which is
+# "unique" to this directory so that configure knows where it should run.
+AC_INIT(knals/Makefile.am, 3.0)
+AC_CANONICAL_SYSTEM
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Automake variables.  Steal the version number from the IVERSION define in libcfs/module.c
+AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c]))
+# AM_MAINTAINER_MODE
+
+sinclude(archdep.m4)
+sinclude(build.m4)
+sinclude(portalsconf.m4)
+
+if test x$enable_inkernel = xyes ; then
+cp Kernelenv.mk Kernelenv.in
+cp Makefile.mk Makefile.in
+cp libcfs/Makefile.mk libcfs/Makefile.in
+cp portals/Makefile.mk portals/Makefile.in
+cp knals/Makefile.mk knals/Makefile.in
+cp knals/socknal/Makefile.mk knals/socknal/Makefile.in
+cp router/Makefile.mk router/Makefile.in
+AC_OUTPUT(Kernelenv)
+fi
+
+
+AM_CONFIG_HEADER(include/config.h)
+
+AC_OUTPUT([Rules.linux Makefile libcfs/Makefile portals/Makefile \
+          unals/Makefile knals/Makefile router/Makefile \
+         knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \
+         knals/scimacnal/Makefile knals/toenal/Makefile \
+          utils/Makefile tests/Makefile doc/Makefile \
+          packaging/Makefile packaging/portals.spec ])
+
diff --git a/lnet/doc/.cvsignore b/lnet/doc/.cvsignore
new file mode 100644 (file)
index 0000000..827dca4
--- /dev/null
@@ -0,0 +1,4 @@
+Makefile
+Makefile.in
+*.eps
+*.pdf
diff --git a/lnet/doc/Data-structures b/lnet/doc/Data-structures
new file mode 100644 (file)
index 0000000..b5532b1
--- /dev/null
@@ -0,0 +1,65 @@
+In this document I will try to draw the data structures and how they
+interrelate in the Portals 3 reference implementation.  It is probably
+best shown with a drawing, so there may be an additional xfig or
+Postscript figure.
+
+
+MEMORY POOLS:
+------------
+
+First, a digression on memory allocation in the library.  As mentioned
+in the NAL Writer's Guide, the library does not link against any
+standard C libraries and as such is unable to dynamically allocate
+memory on its own.  It requires that the NAL implement a method
+for allocation that is appropriate for the protection domain in
+which the library lives.  This is only called when a network
+interface is initialized to allocate the Portals object pools.
+
+These pools are preallocated blocks of objects that the library
+can rapidly make active and manage with a minimum of overhead.
+It also cuts down on overhead for setting up structures
+since the NAL->malloc() callback does not need to be called
+for each object.
+
+The objects are maintained on a per-object type singly linked free
+list and contain a pointer to the next free object.  This pointer
+is NULL if the object is not on the free list and is non-zero
+if it is on the list.  The special sentinel value of 0xDEADBEEF
+is used to mark the end of the free list since NULL could
+indicate that the last object in the list is not free.
+
+When one of the lib_*_alloc() functions is called, the library
+returns the head of the free list and advances the head pointer
+to the next item on the list.  The special case of 0xDEADBEEF is
+checked and a NULL pointer is returned if there are no more
+objects of this type available.   The lib_*_free() functions
+are even simpler -- check to ensure that the object is not already
+free, set its next pointer to the current head and then set
+the head to be this newly freed object.
+
+Since C does not have templates, I did the next best thing and wrote
+the memory pool allocation code as a macro that expands based on the
+type of the argument.  The mk_alloc(T) macro expands to
+write the _lib_T_alloc() and lib_T_free() functions.
+It requires that the object have a pointer of the type T named
+"next_free".  There are also functions that map _lib_T_alloc()
+to lib_T_alloc() so that the library can add some extra
+functionality to the T constructor.
+
+
+
+LINKED LISTS:
+------------
+
+Many of the active Portals objects are stored in doubly linked lists
+when they are active.  These are always implemented with the pointer
+to the next object and a pointer to the next pointer of the
+previous object.  This avoids the "dummy head" object or
+special cases for inserting at the beginning or end of the list.
+The pointer manipulations are a little hairy at times, but
+I hope that they are understandable.
+
+The actual linked list code is implemented as macros in <lib-p30.h>,
+although the object has to know about 
+
+
diff --git a/lnet/doc/Makefile.am b/lnet/doc/Makefile.am
new file mode 100644 (file)
index 0000000..7c65e6c
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+LYX2PDF = lyx --export pdf
+LYX2TXT = lyx --export text
+LYX2HTML = lyx --export html
+SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps
+
+DOCS = portals3.pdf 
+IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps
+LYXFILES= portals3.lyx
+
+MAINTAINERCLEANFILES =  $(IMAGES) $(DOCS) $(GENERATED)
+GENERATED = 
+EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) 
+
+all: $(DOCS)
+
+# update date and version in document
+date := $(shell date +%x)
+tag := $(shell echo '$$Name:  $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/')
+addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g'
+
+# Regenerate when the $(VERSION) or $Name:  $ changes.
+.INTERMEDIATE: $(GENERATED)
+$(GENERATED) : %.lyx: %.lin Makefile
+       $(addversion) $< > $@
+
+.lyx.pdf:
+       @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n"
+
+.lyx.txt:
+       @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n"
+.lyx.html:
+       @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n"
+.fig.eps:
+       -fig2dev -L eps $< > $@
+
+portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx
+
+syncweb: portals3.pdf
+#      cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf
+#      ( cd /usr/src/www ; make lustre ; make synclustre )
+
diff --git a/lnet/doc/Message-life-cycle b/lnet/doc/Message-life-cycle
new file mode 100644 (file)
index 0000000..e8cc7e2
--- /dev/null
@@ -0,0 +1,118 @@
+This documents the life cycle of a message as it arrives and is handled by
+a basic async, packetized NAL.  There are four types of messages that have
+slightly different life cycles, so they are addressed independently.
+
+
+Put request
+-----------
+
+1.  NAL notices that there is an incoming message header on the network
+and reads a ptl_hdr_t in from the wire.
+
+2.  It may store additional NAL specific data that provides context
+for this event in a void* that it will interpret in some fashion
+later.
+
+3.  The NAL calls lib_parse() with a pointer to the header and its
+private data structure.
+
+4.  The library decodes the header and may build a message state
+object that describes the event to be written and the ACK to be
+sent, if any.  It then calls nal->recv() with the private data
+that the NAL passed in, a pointer to the message state object
+and a translated user address.
+
+       The NAL will have been given a chance to pretranslate
+       all user addresses when the buffers are created.  This
+       process is described in the NAL-HOWTO.
+
+5.  The NAL should restore whatever context it required from the
+private data pointer, begin receiving the bytes and possibly store
+some extra state of its own.  It should return at this point.
+
+
+
+Get request
+-----------
+
+1.  As with a Put, the NAL notices the incoming message header and
+passes it to lib_parse().
+
+2.  The library decodes the header and calls nal->recv() with a
+zero byte length, offset and destination to instruct it to clean
+up the wire after reading the header.  The private data will
+be passed in as well, allowing the NAL to retrieve any state
+or context that it requires.
+
+3.  The library may build a message state object to possibly
+write an event log or invalidate a memory region.
+
+4.  The library will build a ptl_msg_t header that specifies the
+Portals protocol information for delivery at the remote end.
+
+5.  The library calls nal->send() with the pre-built header,
+the optional message state object, the four part address
+component, a translated user pointer + offset, and some
+other things.
+
+6.  The NAL is to put the header on the wire or copy it at
+this point (since it is off the stack).  It should store some
+amount of state about its current position in the message and
+the destination address.
+
+7.  And then return to the library.
+
+
+Reply request
+-------------
+
+1.  Starting at "The library decodes the header..."
+
+2.  The library decodes the header and calls nal->recv()
+to bring in the rest of the message.  Flow continues in
+exactly the same fashion as with all other receives.
+
+
+Ack request
+-----------
+
+1.  The library decodes the header, builds the appropriate data
+structures for the event in a message state object and calls nal->recv()
+with a zero byte length, etc.
+
+
+Packet arrival
+--------------
+
+1.  The NAL should notice the arrival of a packet, retrieve whatever
+state it needs from the message ID or other NAL specific header data
+and place the data bytes directly into the user address that was
+given to nal->recv().
+
+       How this happens is outside the scope of the Portals library
+       and solely determined by the NAL...
+
+2.  If this is the last packet in a message, the NAL should retrieve
+the lib_msg_t *cookie that it was given in the call to nal->recv()
+and pass it to lib_finalize().  lib_finalize() may call nal->send()
+to send an ACK, nal->write() to record an entry in the event log,
+nal->invalidate() to unregister a region of memory or do nothing at all.
+
+3.  It should then clean up any remaining NAL specific state about
+the message and go back into the main loop.
+
+
+Outgoing packets
+----------------
+
+1.  When the NAL has pending output, it should put the packets on
+the wire wrapped with whatever implementation specified wrappers.
+
+2.  Once it has output all the packets of a message it should
+call lib_finalize() with the message state object that was
+handed to nal->send().  This will allow the library to clean
+up its state regarding the message and write any pending event
+entries.
+
+
+
diff --git a/lnet/doc/NAL-HOWTO b/lnet/doc/NAL-HOWTO
new file mode 100644 (file)
index 0000000..ea38aed
--- /dev/null
@@ -0,0 +1,293 @@
+This document is a first attempt at describing how to write a NAL
+for the Portals 3 library.  It also defines the library architecture
+and the abstraction of protection domains.
+
+
+First, an overview of the architecture:
+
+    Application
+
+----|----+--------
+         |
+   API  === NAL        (User space)
+         |   
+---------+---|-----
+         |    
+   LIB  === NAL        (Library space)
+         |
+---------+---|-----
+          
+    Physical wire      (NIC space)
+          
+
+Application
+    API
+API-side NAL
+------------
+LIB-side NAL
+    LIB
+LIB-side NAL
+   wire
+
+Communication is through the indicated paths via well defined
+interfaces.  The API and LIB portions are written to be portable
+across platforms and do not depend on the network interface.
+
+Communication between the application and the API code is
+defined in the Portals 3 API specification.  This is the
+user-visible portion of the interface and should be the most
+stable.
+
+
+
+API-side NAL:
+------------
+
+The user space NAL needs to implement only a few functions
+that are stored in a nal_t data structure and called by the
+API-side library:
+
+       int forward( nal_t *nal,
+               int     index,
+               void    *args,
+               size_t  arg_len,
+               void    *ret,
+               size_t  ret_len
+       );
+
+Most of the data structures in the portals library are held in
+the LIB section of the code, so it is necessary to forward API
+calls across the protection domain to the library.  This is
+handled by the NAL's forward method.  Once the argument and return
+blocks are on the remote side the NAL should call lib_dispatch()
+to invoke the appropriate API function.
+
+       int validate( nal_t *nal,
+               void    *base,
+               size_t  extent,
+               void    **trans_base,
+               void    **trans_data
+       );
+
+The validate method provides a means for the NAL to prevalidate
+and possibly pretranslate user addresses into a form suitable
+for fast use by the network card or kernel module.  The trans_base
+pointer will be used by the library every time it needs to
+refer to the block of memory.  The trans_data result is a
+cookie that will be handed to the NAL along with the trans_base.
+
+The library never performs calculations on the trans_base value;
+it only computes offsets that are then handed to the NAL.
+
+
+       int shutdown( nal_t *nal, int interface );
+
+Brings down the network interface.  The remote NAL side should
+call lib_fini() to bring down the library side of the network.
+
+       void yield( nal_t *nal );
+
+This allows the user application to gracefully give up the processor
+while busy waiting.  Performance critical applications may not
+want to take the time to call this function, so it should be an
+option to the PtlEQWait call.  Right now it is not implemented as such.
+
+Lastly, the NAL must implement a function named PTL_IFACE_*, where
+* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR.
+This initialization function is to set up communication with the
+library-side NAL, which should call lib_init() to bring up the
+network interface.
+
+
+
+LIB-side NAL:
+------------
+
+On the library-side, the NAL has much more responsibility.  It
+is responsible for calling lib_dispatch() on behalf of the user,
+it is also responsible for bringing packets off the wire and
+pushing bits out.  As on the user side, the methods are stored
+in a nal_cb_t structure that is defined on a per network
+interface basis.
+
+The calls to lib_dispatch() need to be examined.  The prototype:
+
+       void    lib_dispatch(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       int                     index,
+                       void                    *arg_block,
+                       void                    *ret_block
+       );
+
+has two complications.  The private field is a NAL-specific
+value that will be passed to any callbacks produced as a result
+of this API call.  Kernel module implementations may use this
+for task structures, or perhaps network card data.  It is ignored
+by the library.
+
+Secondly, the arg_block and ret_block must be in the same protection
+domain as the library.  The NAL's two halves must communicate the
+sizes and perform the copies.  After the call, the buffer pointed
+to by ret_block will be filled in and should be copied back to
+the user space.  How this is to be done is NAL specific.
+
+       int lib_parse(
+                       nal_cb_t                *nal,
+                       ptl_hdr_t               *hdr,
+                       void                    *private
+       );
+
+This is the only other entry point into the library from the NAL.
+When the NAL detects an incoming message on the wire it should read
+sizeof(ptl_hdr_t) bytes and pass a pointer to the header to
+lib_parse().  It may set private to be anything that it needs to
+tie the incoming message to callbacks that are made as a result
+of this event.
+
+The method calls are:
+
+       int     (*send)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       ptl_hdr_t               *hdr,
+                       int                     nid,
+                       int                     pid,
+                       int                     gid,
+                       int                     rid,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  len
+       );
+
+This is a tricky function -- it must support async output
+of messages as well as properly synchronized event log writing.
+The private field is the same that was passed into lib_dispatch()
+or lib_parse() and may be used to tie this call to the event
+that initiated the entry to the library.
+
+The cookie is a pointer to a library private value that must
+be passed to lib_finalize() once the message has been completely
+sent.  It should not be examined by the NAL for any meaning.
+
+The four ID fields are passed in, although some implementations
+may not use all of them.
+
+The single base pointer has been replaced with the translated
+address that the API NAL generated in the api_nal->validate()
+call.  The trans_data is unchanged and the offset is in bytes.
+
+
+       int     (*recv)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  mlen,
+                       size_t                  rlen
+       );
+
+This callback will only be called in response to lib_parse().
+The cookie, trans_base and trans_data are as discussed in send().
+The NAL should read mlen bytes from the wire, deposit them into
+trans_base + offset and then discard (rlen - mlen) bytes.
+Once the entire message has been received the NAL should call
+lib_finalize() with the lib_msg_t *cookie.
+
+The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0
+are used to indicate that the NAL should clean up the wire.  This could
+be implemented as a blocking call, although having it return as quickly
+as possible is desirable.
+
+       int     (*write)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       user_ptr                trans_addr,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+
+                       void                    *src_addr,
+                       size_t                  len
+       );
+
+This is essentially a cross-protection domain memcpy().  The user address
+has been pretranslated by the api_nal->validate() call.
+
+       void    *(*malloc)(
+                       nal_cb_t                *nal,
+                       size_t                  len
+       );
+
+       void    (*free)(
+                       nal_cb_t                *nal,
+                       void                    *buf
+       );
+
+Since the NAL may be in a non-standard hosted environment it can
+not call malloc().  This allows the library side NAL to implement
+the system specific malloc().  In the current reference implementation
+the library only calls nal->malloc() when the network interface is
+initialized and then calls free when it is brought down.  The library
+maintains its own pool of objects for allocation so only one call to
+malloc is made per object type.
+
+       void    (*invalidate)(
+                       nal_cb_t                *nal,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  extent
+       );
+
+User addresses are validated/translated at the user-level API NAL
+method, which is likely to push them to this level.  Meanwhile,
+the library NAL will be notified when the library no longer
+needs the buffer.  Overlapped buffers are not detected by the
+library, so the NAL should ref count each page involved.
+
+Unfortunately we have a few bugs when the invalidate method is
+called.  It is still in progress...
+
+       void    (*printf)(
+                       nal_cb_t                *nal,
+                       const char              *fmt,
+                       ...
+       );
+
+As with malloc(), the library does not have any way to do printf
+or printk.  It is not necessary for the NAL to implement this
+call, although it will make debugging difficult.
+
+       void    (*cli)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+       void    (*sti)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+These are used by the library to mark critical sections.
+
+       int     (*gidrid2nidpid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                gid,
+                       ptl_id_t                rid,
+                       ptl_id_t                *nid,
+                       ptl_id_t                *pid
+       );
+
+
+       int     (*nidpid2gidrid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                nid,
+                       ptl_id_t                pid,
+                       ptl_id_t                *gid,
+                       ptl_id_t                *rid
+       );
+
+Rolf added these.  I haven't looked at how they have to work yet.
diff --git a/lnet/doc/file.fig b/lnet/doc/file.fig
new file mode 100644 (file)
index 0000000..914c294
--- /dev/null
@@ -0,0 +1,111 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1200 750 1650 1050
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1050 1650 750 1200 750 1200 1050 1650 1050
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001
+-6
+6 1200 2325 1650 2625
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2625 1650 2325 1200 2325 1200 2625 1650 2625
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001
+-6
+6 1200 1800 1650 2100
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2100 1650 1800 1200 1800 1200 2100 1650 2100
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001
+-6
+6 1200 1275 1650 1575
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1575 1650 1275 1200 1275 1200 1575 1650 1575
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001
+-6
+6 450 750 900 1200
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 825 450 1050
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1050 900 825
+-6
+6 450 2325 900 2775
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 2400 450 2625
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2625 900 2400
+-6
+6 450 1800 900 2250
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1875 450 2100
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2100 900 1875
+-6
+6 450 1275 900 1725
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1350 450 1575
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1575 900 1350
+-6
+6 2250 750 3450 2625
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1200 3150 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1500 3150 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1800 3150 1800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2100 3150 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 975 3150 975 3150 2625 2550 2625 2550 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2400 3150 2400
+4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2400 2550 1350
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1875 2550 1050
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1425 2550 1950
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 900 2550 1650
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 900 1200 900
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1425 1200 1425
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1950 1200 1950
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2475 1200 2475
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2025 2550 2250
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2550 2550 2475
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1875 2850 1875 600 225 600 225 2850 1875 2850
+4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001
diff --git a/lnet/doc/flow_new.fig b/lnet/doc/flow_new.fig
new file mode 100644 (file)
index 0000000..d828dea
--- /dev/null
@@ -0,0 +1,213 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 525 2175 1575 2925
+6 675 2287 1425 2812
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001
+4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 2550 1050 2175 525 2550 1050 2925 1575 2550
+-6
+6 3450 1275 4350 1725
+6 3600 1312 4200 1687
+4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001
+4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3450 1275 4350 1275 4350 1725 3450 1725 3450 1275
+-6
+6 4650 1275 5550 1725
+6 4725 1312 5475 1687
+4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001
+4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4650 1275 5550 1275 5550 1725 4650 1725 4650 1275
+-6
+6 1350 525 2250 975
+6 1350 562 2250 937
+4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001
+4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 525 2250 525 2250 975 1350 975 1350 525
+-6
+6 525 1125 1575 1875
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 1500 1050 1125 525 1500 1050 1875 1575 1500
+4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001
+-6
+6 2340 1237 2940 1687
+6 2340 1237 2940 1687
+4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001
+4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001
+4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001
+-6
+-6
+6 525 3225 1575 3975
+6 675 3375 1425 3750
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001
+-6
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        525 3600 1050 3225 1575 3600 1050 3975 525 3600
+-6
+6 3300 3375 4350 3825
+6 3300 3412 4350 3787
+4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3300 3375 4350 3375 4350 3825 3300 3825 3300 3375
+-6
+6 1950 3225 3000 3975
+6 2250 3450 2700 3750
+4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        3000 3600 2475 3225 1950 3600 2475 3975 3000 3600
+-6
+6 3150 4500 4200 4950
+6 3150 4537 4200 4912
+4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3150 4500 4200 4500 4200 4950 3150 4950 3150 4500
+-6
+6 600 4500 1500 4950
+6 675 4537 1425 4912
+4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001
+4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        600 4500 1500 4500 1500 4950 600 4950 600 4500
+-6
+6 4650 4350 5700 5100
+6 4950 4537 5400 4912
+6 4950 4537 5400 4912
+4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001
+4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001
+-6
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        5700 4725 5175 4350 4650 4725 5175 5100 5700 4725
+-6
+6 6000 4500 6900 4950
+6 6225 4575 6675 4875
+4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001
+4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        6000 4500 6900 4500 6900 4950 6000 4950 6000 4500
+-6
+6 1800 4350 2850 5100
+6 2100 4575 2550 4875
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        2850 4725 2325 4350 1800 4725 2325 5100 2850 4725
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 1875 1050 2175
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 1500 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 450 1050 1125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1350 750 1050 750
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 2925 1050 3225
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3150 1500 3450 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4350 1500 4650 1500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        2100 1500 2625 1125 3150 1500 2625 1875 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 3600 1950 3600
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 3975 1050 4500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 3600 3300 3600
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 4725 1800 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        5700 4725 6000 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2850 4725 3150 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4200 4725 4650 4725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        6900 4725 7950 4725
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1575 2550 1650 2550 1800 2550 1800 2400 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        2250 750 2475 750 2625 750 2625 900 2625 1125
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        7500 4725 7500 1650 7500 1500 7350 1500 5550 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        2475 3225 2475 2400 2475 2250 2325 2250 1800 2250
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        3825 3375 3825 2175 3825 2025 3675 2025 1800 2025
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125
+        4425 4275 4425 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125
+        7275 4275 7275 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001
+4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001
diff --git a/lnet/doc/get.fig b/lnet/doc/get.fig
new file mode 100644 (file)
index 0000000..28db949
--- /dev/null
@@ -0,0 +1,33 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 2775 900 3525 1200
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001
+-6
+6 1350 1725 2175 2025
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 750
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 825 2700 1275
+2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1350 900 1950
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
diff --git a/lnet/doc/ieee.bst b/lnet/doc/ieee.bst
new file mode 100644 (file)
index 0000000..5367caa
--- /dev/null
@@ -0,0 +1,1114 @@
+% ---------------------------------------------------------------
+%
+% $Id: ieee.bst,v 1.1.2.1 2003/05/19 04:25:30 braam Exp $
+%
+% by Paolo.Ienne@di.epfl.ch
+%
+% ---------------------------------------------------------------
+%
+% no guarantee is given that the format corresponds perfectly to 
+% IEEE 8.5" x 11" Proceedings, but most features should be ok.
+%
+% ---------------------------------------------------------------
+%
+% `ieee' from BibTeX standard bibliography style `abbrv'
+% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
+% Copyright (C) 1985, all rights reserved.
+% Copying of this file is authorized only if either
+% (1) you make absolutely no changes to your copy, including name, or
+% (2) if you do make changes, you name it something other than
+% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
+% This restriction helps ensure that all standard styles are identical.
+% The file btxbst.doc has the documentation for this style.
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+ { add.period$ write$
+   newline$
+   "\newblock " write$
+ }
+ { output.state before.all =
+     'write$
+     { add.period$ " " * write$ }
+   if$
+ }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+ 'skip$
+ { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+ { namesleft #1 >
+     { ", " * t * }
+     { numnames #2 >
+  { "," * }
+  'skip$
+       if$
+       t "others" =
+  { " et~al." * }
+  { " and " * t * }
+       if$
+     }
+   if$
+ }
+ 't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+ { ", editors" * }
+ { ", editor" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+ { t #1 #2 substring$ "--" = not
+     { "--" *
+       t #2 global.max$ substring$ 't :=
+     }
+     {   { t #1 #1 substring$ "-" = }
+  { "-" *
+    t #2 global.max$ substring$ 't :=
+  }
+       while$
+     }
+   if$
+ }
+ { t #1 #1 substring$ *
+   t #2 global.max$ substring$ 't :=
+ }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year empty$
+    { month empty$
+ { "" }
+ { "there's a month but no year in " cite$ * warning$
+   month
+ }
+      if$
+    }
+    { month empty$
+ 'year
+ { month " " * year * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+ 'skip$
+ { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+ { series field.or.null }
+ { output.state mid.sentence =
+     { "number" }
+     { "Number" }
+   if$
+   number tie.or.space.connect
+   series empty$
+     { "there's a number but no series in " cite$ * warning$ }
+     { " in " * series * }
+   if$
+ }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+ { edition "l" change.case$ " edition" * }
+ { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+ { "pages" pages n.dashify tie.or.space.connect }
+ { "page" pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "(" number * ")" * *
+      volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+ { pop$ format.pages }
+ { ":" * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+ { "chapter" }
+ { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+ 'skip$
+ { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+ { "In " booktitle emphasize * }
+ { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+ { "need key or journal for " cite$ * " to crossref " * crossref *
+   warning$
+   ""
+ }
+ { "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}
+{ editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+ 'skip$
+ { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+     { " et~al." * }
+     { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+   if$
+ }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { series empty$
+     { "need editor, key, or series for " cite$ * " to crossref " *
+       crossref * warning$
+       "" *
+     }
+     { "{\em " * series * "\/}" * }
+   if$
+ }
+ { key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { booktitle empty$
+     { "need editor, key, or booktitle for " cite$ * " to crossref " *
+       crossref * warning$
+       ""
+     }
+     { "In {\em " booktitle * "\/}" * }
+   if$
+ }
+ { "In " key * }
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+ { organization publisher new.sentence.checkb
+   organization output
+   publisher output
+   format.date "year" output.check
+ }
+ { address output.nonnull
+   format.date "year" output.check
+   new.sentence
+   organization output
+   publisher output
+ }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  author empty$
+    { organization empty$
+ 'skip$
+ { organization output.nonnull
+   address output
+ }
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+ { address new.block.checka
+   address output
+ }
+ 'skip$
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  editor empty$
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+ { publisher new.sentence.checka }
+ { organization publisher new.sentence.checkb
+   organization output
+ }
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+ 'skip$
+ { organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+MACRO {jan} {"Jan."}
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"Mar."}
+
+MACRO {apr} {"Apr."}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+MACRO {acmcs} {"ACM Comput. Surv."}
+
+MACRO {acta} {"Acta Inf."}
+
+MACRO {cacm} {"Commun. ACM"}
+
+MACRO {ibmjrd} {"IBM J. Res. Dev."}
+
+MACRO {ibmsj} {"IBM Syst.~J."}
+
+MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
+
+MACRO {ieeetc} {"IEEE Trans. Comput."}
+
+MACRO {ieeetcad}
+ {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
+
+MACRO {ipl} {"Inf. Process. Lett."}
+
+MACRO {jacm} {"J.~ACM"}
+
+MACRO {jcss} {"J.~Comput. Syst. Sci."}
+
+MACRO {scp} {"Sci. Comput. Programming"}
+
+MACRO {sicomp} {"SIAM J. Comput."}
+
+MACRO {tocs} {"ACM Trans. Comput. Syst."}
+
+MACRO {tods} {"ACM Trans. Database Syst."}
+
+MACRO {tog} {"ACM Trans. Gr."}
+
+MACRO {toms} {"ACM Trans. Math. Softw."}
+
+MACRO {toois} {"ACM Trans. Office Inf. Syst."}
+
+MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
+
+MACRO {tcs} {"Theoretical Comput. Sci."}
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+ { "   " * }
+ 'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+ { "et al" * }
+ { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+ { "to sort, need author or key in " cite$ * warning$
+   ""
+ }
+ { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+ { key empty$
+     { "to sort, need author, editor, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need author, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need editor, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION {presort}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+ 'editor.organization.sort
+ { type$ "manual" =
+     'author.organization.sort
+     'author.sort
+   if$
+ }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label }
+
+INTEGERS { number.label longest.label.width }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #1 'number.label :=
+  #0 'longest.label.width :=
+}
+
+FUNCTION {longest.label.pass}
+{ number.label int.to.str$ 'label :=
+  number.label #1 + 'number.label :=
+  label width$ longest.label.width >
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {longest.label.pass}
+
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  * 
+  "}\setlength{\itemsep}{-1ex}\small" * write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
+
+% end of file ieee.bst
+% ---------------------------------------------------------------
diff --git a/lnet/doc/mpi.fig b/lnet/doc/mpi.fig
new file mode 100644 (file)
index 0000000..e1a91b5
--- /dev/null
@@ -0,0 +1,117 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 150 1650 900 2025
+4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001
+4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001
+-6
+6 150 150 900 525
+4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001
+4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001
+-6
+6 2550 4125 3150 4725
+4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001
+4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001
+4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001
+-6
+6 1050 1575 1950 1875
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 1575 1950 1575 1950 1875 1050 1875 1050 1575
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001
+-6
+6 5400 1575 6300 2175
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 1575 6300 1575 6300 2175 5400 2175 5400 1575
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001
+-6
+6 5400 2400 6300 3000
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 2400 6300 2400 6300 3000 5400 3000 5400 2400
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001
+-6
+6 1050 2400 1950 2700
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 2400 1950 2400 1950 2700 1050 2700 1050 2400
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001
+-6
+6 1050 825 1950 1125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 825 1950 825 1950 1125 1050 1125 1050 825
+4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1575
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2025 4050 3375
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 675 6600 675
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 1350 6600 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 4125 3300 4125 3300 4725 2400 4725 2400 4125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 4500 4050 3675
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 1725 5400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2550 5400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2850 4050 3450
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1800 1500 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 825 3300 825 3300 1275 2400 1275 2400 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 2625 1500 4125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 4125 1950 4125 1950 4425 1050 4425 1050 4125
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 300 1500 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 975 2400 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 1725 2400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 2550 2400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 4275 2400 4275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 1575 3300 1575 3300 2175 2400 2175 2400 1575
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 2400 3300 2400 3300 3000 2400 3000 2400 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4050 3300 5250 3300 5250 3750 4050 3750 4050 3300
+4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001
+4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001
+4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001
+4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001
+4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001
+4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001
+4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001
diff --git a/lnet/doc/portals.fig b/lnet/doc/portals.fig
new file mode 100644 (file)
index 0000000..9b1271b
--- /dev/null
@@ -0,0 +1,68 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 900 1650 900 1650 1200 1350 1200 1350 900
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1800 1350 2100 1350 2100 1650 1800 1650 1800 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2250 1800 2550 1800 2550 2100 2250 2100 2250 1800
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        4200 375 4200 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        525 600 1125 600 1125 2100 525 2100 525 600
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4425 1275 4875 1275 4875 1950 4425 1950 4425 1275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 1200 3150 1200 3150 1500 2550 1500 2550 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 1425 4425 1425
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3600 825 3750 825 3750 1125 3600 1125 3600 825
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2025 1425 2550 1425
+2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        4425 750 4875 750 4875 1125 4425 1125 4425 750
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3675 975 4425 975
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2
+       0 0 1.00 60.00 120.00
+        825 1050 1350 1050
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1350 1500 1500 1650 1500 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1950 1575 1950 1800 1950 1950 2100 1950 2250 1950
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 975 1125 975
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 1125 1125 1125
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7
+       0 0 1.00 60.00 120.00
+        3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975
+        3600 975
+        0.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001
+4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001
+4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001
+4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001
+4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001
+4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001
+4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001
diff --git a/lnet/doc/portals3.bib b/lnet/doc/portals3.bib
new file mode 100644 (file)
index 0000000..323b99f
--- /dev/null
@@ -0,0 +1,124 @@
+@Article{           Cplant,
+    title       = { {M}assively {P}arallel {C}omputing with
+                    {C}ommodity {C}omponents },
+    author      = { Ron Brightwell and David S. Greenberg and Arthur
+                    B. Maccabe and Rolf Riesen },
+    journal     = { Parallel Computing },
+    volume      = { 26 },
+    month       = { February },
+    pages       = { 243-266 },
+    year        = { 2000 }
+}
+
+@Manual{     Portals,
+    organization = { Sandia National Laboratories },
+    title        = { {P}uma {P}ortals },
+    note         = { http://www.cs.sandia.gov/puma/portals },
+    year         = { 1997 }
+}
+
+@Techreport{      VIA,
+  title         = { {V}irtual {I}nterface {A}rchitecture
+                    {S}pecification {V}ersion 1.0 }, 
+  author        = { {Compaq, Microsoft, and Intel} },
+  institution   = { Compaq, Microsoft, and Intel },
+  month         = { December },
+  year          = { 1997 }
+}
+
+@Techreport{      ST,
+  title         = { {I}nformation {T}echnology - {S}cheduled
+                  {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 },
+  author        = { {Task Group of Technical Committee T11} },
+  institution   = { Accredited Standards Committee NCITS },
+  month         = { July },
+  year          = { 1998 }
+}
+
+@Manual{     TFLOPS,
+    organization = { Sandia National Laboratories },
+    title        = { ASCI Red },
+    note         = { http://www.sandia.gov/ASCI/TFLOP },
+    year         = { 1996 }
+}
+
+@Techreport{      GM,
+  title         = { The {GM} {M}essage {P}assing {S}ystem },
+  author         = { {Myricom, Inc.} },
+  institution    = { {Myricom, Inc.} },
+  year          = { 1997 },
+}
+
+@Article{           MPIstandard,
+    title        = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard },
+    author       = { {Message Passing Interface Forum} },
+    journal      = { The International Journal of Supercomputer Applications
+                     and High Performance Computing },
+    volume       = { 8 },
+    year         = { 1994 }
+}
+
+@Inproceedings{    PumaOS,
+    author       = "Lance Shuler and Chu Jong and Rolf Riesen and
+                    David van Dresser and Arthur B. Maccabe and
+                    Lee Ann Fisk and T. Mack Stallcup",
+    booktitle    = "Proceedings of the 1995 Intel Supercomputer
+                    User's Group Conference",
+    title        = "The {P}uma Operating System for Massively Parallel Computers",
+    organization = "Intel Supercomputer User's Group",
+    year         = 1995
+}
+
+@InProceedings{   SUNMOS,
+author          = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and
+                   Stephen R. Wheat",
+title           = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide",
+booktitle       = "Proceedings of the {Intel} Supercomputer Users' Group. 1994
+                   Annual North America Users' Conference.",
+year            = 1994,
+pages           = "245--251",
+month           = "June",
+location        = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps"
+}
+
+@InProceedings {   PumaMPI,
+    title        = { Design and Implementation of {MPI} on {P}uma Portals },
+    author       = { Ron Brightwell and Lance Shuler },
+    booktitle    = { Proceedings of the Second MPI Developer's Conference },
+    pages        = { 18-25 },
+    month        = { July },
+    year         = { 1996 }
+}
+
+@Inproceedings{     FM2,
+    author       = { Mario Lauria and Scott Pakin and Andrew Chien },
+    title        = { {E}fficient {L}ayering for {H}igh {S}peed
+                     {C}ommunication: {F}ast {M}essages 2.x },
+    Booktitle    = { Proceedings of the IEEE International Symposium
+                     on High Performance Distributed Computing },
+    year         = { 1998 }
+}
+
+@Manual {          CraySHMEM,
+    title        = "SHMEM Technical Note for C, SG-2516 2.3",
+    organization = "Cray Research, Inc.",
+    month        = "October",
+    year         = 1994
+}
+
+@Manual {          MPI2,
+    title        = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface",
+    organization = "Message Passing Interface Forum",
+    note         = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html",
+    month        = "July",
+    year         = 1997
+}
+
+@InProceedings {   PMMPI,
+    title        = { {The Design and Implementation of Zero Copy MPI Using
+                       Commodity Hardware with a High Performance Network} },
+    author       = { Francis O'Carroll and  Hiroshi Tezuka and Atsushi Hori
+                     and Yutaka Ishikawa  },
+    booktitle    = { Proceedings of the ICS },
+    year         = { 1998 }
+}
diff --git a/lnet/doc/portals3.lyx b/lnet/doc/portals3.lyx
new file mode 100644 (file)
index 0000000..f3c24e0
--- /dev/null
@@ -0,0 +1,15946 @@
+#LyX 1.2 created this file. For more info see http://www.lyx.org/
+\lyxformat 220
+\textclass report
+\begin_preamble
+\usepackage{fullpage}
+\renewenvironment{comment}%
+{\begin{quote}\textbf{Discussion}: \slshape}%
+{\end{quote}}
+\pagestyle{myheadings}
+\markboth{$Revision: 1.1.2.1 $\hfil$Date: 2003/05/19 04:25:30 $}%
+{$Date: 2003/05/19 04:25:30 $\hfil$Revision: 1.1.2.1 $}
+\end_preamble
+\language american
+\inputencoding auto
+\fontscheme pslatex
+\graphics default
+\paperfontsize 10
+\spacing single 
+\papersize letterpaper
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 2
+\tocdepth 2
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 2
+\paperpagestyle headings
+
+\layout Title
+
+The Portals 3.2 Message Passing Interface 
+\newline 
+ Revision 1.1
+\layout Author
+
+Ron Brightwell
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+R.
+ Brightwell and R.
+ Riesen are with the Scalable Computing Systems Department, Sandia National
+ Laboratories, P.O.
+ Box 5800, Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov.
+\end_inset 
+
+, Arthur B.
+ Maccabe
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+A.
+ B.
+ Maccabe is with the Computer Science Department, University of New Mexico,
+ Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87131-1386, maccabe@cs.unm.edu.
+\end_inset 
+
+, Rolf Riesen and Trammell Hudson
+\layout Abstract
+
+This report presents a specification for the Portals 3.2 message passing
+ interface.
+ Portals 3.2 is intended to allow scalable, high-performance network communicatio
+n between nodes of a parallel computing system.
+ Specifically, it is designed to support a parallel computing platform composed
+ of clusters of commodity workstations connected by a commodity system area
+ network fabric.
+ In addition, Portals 3.2 is well suited to massively parallel processing
+ and embedded systems.
+ Portals 3.2 represents an adaptation of the data movement layer developed
+ for massively parallel processing platforms, such as the 4500-node Intel
+ TeraFLOPS machine.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+clearpage
+\backslash 
+pagenumbering{roman}
+\backslash 
+setcounter{page}{3}
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset LatexCommand \tableofcontents{}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList figure
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList table
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Chapter*
+
+Summary of Changes for Revision 1.1
+\layout Enumerate
+
+Updated version number to 3.2 throughout the document
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sub:PtlGetId}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_SEGV
+\family default 
+ to error list for 
+\shape italic 
+PtlGetId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_ML_TOOLONG
+\family default 
+ to error list for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meunlink}
+
+\end_inset 
+
+: removed text referring to a list of associated memory descriptors.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added text to describe unlinking a free-floating memory descriptor.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added entry for 
+\family typewriter 
+ptl_seq_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+:
+\begin_deeper 
+\layout Enumerate
+
+added definition of 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+added text to clarify 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: modified text for 
+\family typewriter 
+unlink_op
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: added text to clarify multiple calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: added text to clarify 
+\family typewriter 
+unlink_nofit
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:receiving}
+
+\end_inset 
+
+: removed text indicating that an MD will reject a message if the associated
+ EQ is full.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ error code and text to indicate that only MDs with no pending operations
+ can be unlinked.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ return code.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added user id field, MD handle field, and NI specific failure field to
+ the 
+\family typewriter 
+ptl_event_t
+\family default 
+ structure.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_EVENT_UNLINK
+\family default 
+ event type.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: removed 
+\shape slanted 
+PtlTransId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+: listed allowable constants with relevant fields.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: added 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ function.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_PT_FULL
+\family default 
+ return code for 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+: updated to reflect new event types.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_nid_t
+\family default 
+, 
+\family typewriter 
+ptl_pid_t
+\family default 
+, and 
+\family typewriter 
+ptl_uid_t
+\family default 
+.
+\layout Chapter*
+
+Summary of Changes for Version 3.1
+\layout Section*
+
+Thread Issues
+\layout Standard
+
+The most significant change to the interface from version 3.0 to 3.1 involves
+ the clarification of how the interface interacts with multi-threaded applicatio
+ns.
+ We adopted a generic thread model in which processes define an address
+ space and threads share the address space.
+ Consideration of the API in the light of threads led to several clarifications
+ throughout the document: 
+\layout Enumerate
+
+Glossary: 
+\begin_deeper 
+\layout Enumerate
+
+added a definition for 
+\emph on 
+thread
+\emph default 
+, 
+\layout Enumerate
+
+reworded the definition for 
+\emph on 
+process
+\emph default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+: added section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:threads}
+
+\end_inset 
+
+ to describe the multi-threading model used by the Portals API.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlfini}
+
+\end_inset 
+
+: 
+\emph on 
+PtlFini
+\emph default 
+ should be called once as the process is terminating and not as each thread
+ terminates.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+: Portals does not define thread ids.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+: network interfaces are associated with processes, not threads.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlNIInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqget}
+
+\end_inset 
+
+: 
+\emph on 
+PtlEQGet
+\emph default 
+ returns 
+\family typewriter 
+PTL_EQ_EMPTY
+\family default 
+ if a thread is blocked on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqwait}
+
+\end_inset 
+
+: waiting threads are awakened in FIFO order.
+\layout Standard
+
+Two functions, 
+\emph on 
+PtlNIBarrier
+\emph default 
+ and 
+\emph on 
+PtlEQCount
+\emph default 
+ were removed from the API.
+\emph on 
+PtlNIBarrier
+\emph default 
+ was defined to block the calling process until all of the processes in
+ the application group had invoked 
+\emph on 
+PtlNIBarrier
+\emph default 
+.
+ We now consider this functionality, along with the concept of groups (see
+ the discussion under 
+\begin_inset Quotes eld
+\end_inset 
+
+other changes
+\begin_inset Quotes erd
+\end_inset 
+
+), to be part of the runtime system, not part of the Portals API.
+\emph on 
+PtlEQCount
+\emph default 
+ was defined to return the number of events in an event queue.
+ Because external operations may lead to new events being added and other
+ threads may remove events, the value returned by 
+\emph on 
+PtlEQCount
+\emph default 
+ would have to be a hint about the number of events in the event queue.
+\layout Section*
+
+Handling small, unexpected messages
+\layout Standard
+
+Another set of changes relates to handling small unexpected messages in
+ MPI.
+ In designing version 3.0, we assumed that each unexpected message would
+ be placed in a unique memory descriptor.
+ To avoid the need to process a long list of memory descriptors, we moved
+ the memory descriptors out of the match list and hung them off of a single
+ match list entry.
+ In this way, large unexpected messages would only encounter a single 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry before encountering the 
+\begin_inset Quotes eld
+\end_inset 
+
+long message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry.
+ Experience with this strategy identified resource management problems with
+ this approach.
+ In particular, a long sequence of very short (or zero length) messages
+ could quickly exhaust the memory descriptors constructed for handling unexpecte
+d messages.
+ Our new strategy involves the use of several very large memory descriptors
+ for small unexpected messages.
+ Consecutive unexpected messages will be written into the first of these
+ memory descriptors until the memory descriptor fills up.
+ When the first of the 
+\begin_inset Quotes eld
+\end_inset 
+
+small memory
+\begin_inset Quotes erd
+\end_inset 
+
+ descriptors fills up, it will be unlinked and subsequent short messages
+ will be written into the next 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor.
+ In this case, a 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor will be declared full when it does not have sufficient
+ space for the largest small unexpected message.
+\layout Standard
+
+This led to two significant changes.
+ First, each match list entry now has a single memory descriptor rather
+ than a list of memory descriptors.
+ Second, in addition to exceeding the operation threshold, a memory descriptor
+ can be unlinked when the local offset exceeds a specified value.
+ These changes have led to several changes in this document: 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{subsec:paddress}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed references to the memory descriptor list, 
+\layout Enumerate
+
+changed the portals address translation description to indicate that unlinking
+ a memory descriptor implies unlinking the associated match list entry---match
+ list entries can no longer be unlinked independently from the memory descriptor.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed unlink from argument list, 
+\layout Enumerate
+
+removed description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+changed wording of the error condition when the Portal table index already
+ has an associated match list.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+: removed unlink from argument list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+added description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+removed reference to memory descriptor lists, 
+\layout Enumerate
+
+changed wording of the error condition when match list entry already has
+ an associated memory descriptor, 
+\layout Enumerate
+
+changed the description of the 
+\family typewriter 
+unlink
+\family default 
+ argument.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+: removed 
+\family typewriter 
+PtlMDInsert
+\family default 
+ operation.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: removed references to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: removed references to PtlMDInsert.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+: revised the MPI example to reflect the changes to the interface.
+\layout Standard
+
+Several changes have been made to improve the general documentation of the
+ interface.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_ID_ANY
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: documented the return value 
+\family typewriter 
+PTL_INV_EQ
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+: clarified the description of the 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:implvals}
+
+\end_inset 
+
+: introduced a new section to document the implementation defined values.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: modified Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ to indicate where each constant is introduced and where it is used.
+\layout Section*
+
+Other changes
+\layout Subsection*
+
+Implementation defined limits (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version provided implementation defined limits for the maximum
+ number of match entries, the maximum number of memory descriptors, etc.
+ Rather than spanning the entire implementation, these limits are now associated
+ with individual network interfaces.
+\layout Subsection*
+
+Added User Ids (Section 
+\begin_inset LatexCommand \ref{sec:uid}
+
+\end_inset 
+
+)
+\layout Standard
+
+Group Ids had been used to simplify access control entries.
+ In particular, a process could allow access for all of the processes in
+ a group.
+ User Ids have been introduced to regain this functionality.
+ We use user ids to fill this role.
+\layout Subsection*
+
+Removed Group Ids and Rank Ids (Section 
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version of Portals had two forms for addressing processes: <node
+ id, process id> and <group id, rank id>.
+ A process group was defined as the collection of processes created during
+ application launch.
+ Each process in the group was given a unique rank id in the range 0 to
+\begin_inset Formula $n-1$
+\end_inset 
+
+ where 
+\begin_inset Formula $n$
+\end_inset 
+
+ was the number of processes in the group.
+ We removed groups because they are better handled in the runtime system.
+\layout Subsection*
+
+Match lists (Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+)
+\layout Standard
+
+It is no longer illegal to have an existing match entry when calling PtlMEAttach.
+ A position argument was added to the list of arguments supplied to 
+\emph on 
+PtlMEAttach
+\emph default 
+ to specify whether the new match entry is prepended or appended to the
+ existing list.
+ If there is no existing match list, the position argument is ignored.
+\layout Subsection*
+
+Unlinking Memory Descriptors (Section 
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, a memory descriptor could be unlinked if the offset exceeded
+ a threshold upon the completion of an operation.
+ In this version, the unlinking is delayed until there is a matching operation
+ which requires more memory than is currently available in the descriptor.
+ In addition to changes in this section, this led to a revision of Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+.
+\layout Subsection*
+
+Split Phase Operations and Events (Section 
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, there were five types of events: 
+\family typewriter 
+PTL_EVENT_PUT
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+, and 
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+.
+ The first four of these reflected the completion of potentially long operations.
+ We have introduced new event types to reflect the fact that long operations
+ have a distinct starting point and a distinct completion point.
+ Moreover, the completion may be successful or unsuccessful.
+\layout Standard
+
+In addition to providing a mechanism for reporting failure to higher levels
+ of software, this split provides an opportunity for improved ordering
+ semantics.
+ Previously, if one process initiated two operations (e.g., two put operations)
+ on a remote process, these operations were guaranteed to complete in the
+ same order that they were initiated.
+ Now, we only guarantee that the initiation events are delivered in the
+ same order.
+ In particular, the operations do not need to complete in the order that
+ they were initiated.
+\layout Subsection*
+
+Well known process ids (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+To support the notion of 
+\begin_inset Quotes eld
+\end_inset 
+
+well known process ids,
+\begin_inset Quotes erd
+\end_inset 
+
+ we added a process id argument to the arguments for PtlNIInit.
+\layout Chapter*
+
+Glossary
+\layout Description
+
+API Application Programming Interface.
+ A definition of the functions and semantics provided by a library of functions.
+\layout Description
+
+Initiator A 
+\emph on 
+process
+\emph default 
+ that initiates a message operation.
+\layout Description
+
+Message An application-defined unit of data that is exchanged between 
+\emph on 
+processes
+\emph default 
+.
+\layout Description
+
+Message\SpecialChar ~
+Operation Either a put operation, which writes data, or a get operation,
+ which reads data.
+\layout Description
+
+Network A network provides point-to-point communication between 
+\emph on 
+nodes
+\emph default 
+.
+ Internally, a network may provide multiple routes between endpoints (to
+ improve fault tolerance or to improve performance characteristics); however,
+ multiple paths will not be exposed outside of the network.
+\layout Description
+
+Node A node is an endpoint in a 
+\emph on 
+network
+\emph default 
+.
+ Nodes provide processing capabilities and memory.
+ A node may provide multiple processors (an SMP node) or it may act as a
+\emph on 
+gateway
+\emph default 
+ between networks.
+\layout Description
+
+Process A context of execution.
+ A process defines a virtual memory (VM) context.
+ This context is not shared with other processes.
+ Several threads may share the VM context defined by a process.
+\layout Description
+
+Target A 
+\emph on 
+process
+\emph default 
+ that is acted upon by a message operation.
+\layout Description
+
+Thread A context of execution that shares a VM context with other threads.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\layout Standard
+
+\backslash 
+setcounter{page}{1}
+\backslash 
+pagenumbering{arabic}
+\end_inset 
+
+
+\layout Chapter
+
+Introduction
+\begin_inset LatexCommand \label{sec:intro}
+
+\end_inset 
+
+
+\layout Section
+
+Overview
+\layout Standard
+
+This document describes an application programming interface for message
+ passing between nodes in a system area network.
+ The goal of this interface is to improve the scalability and performance
+ of network communication by defining the functions and semantics of message
+ passing required for scaling a parallel computing system to ten thousand
+ nodes.
+ This goal is achieved by providing an interface that will allow a quality
+ implementation to take advantage of the inherently scalable design of Portals.
+\layout Standard
+
+This document is divided into several sections: 
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:intro}
+
+\end_inset 
+
+---Introduction This section describes the purpose and scope of the Portals
+ API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+---An\SpecialChar ~
+Overview\SpecialChar ~
+of\SpecialChar ~
+the\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section gives a brief overview of the
+ Portals API.
+ The goal is to introduce the key concepts and terminology used in the descripti
+on of the API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:api}
+
+\end_inset 
+
+---The\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section describes the functions and semantics of
+ the Portals application programming interface.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+---The\SpecialChar ~
+Semantics\SpecialChar ~
+of\SpecialChar ~
+Message\SpecialChar ~
+Transmission This section describes the semantics
+ of message transmission.
+ In particular, it describes the information transmitted in each type of message and
+ the processing of incoming messages.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:examples}
+
+\end_inset 
+
+---Examples This section presents several examples intended to illustrate
+ the use of the Portals API.
+\layout Section
+
+Purpose
+\layout Standard
+
+Existing message passing technologies available for commodity cluster networking
+ hardware do not meet the scalability goals required by the Cplant\SpecialChar ~
+
+\begin_inset LatexCommand \cite{Cplant}
+
+\end_inset 
+
+ project at Sandia National Laboratories.
+ The goal of the Cplant project is to construct a commodity cluster that
+ can scale to the order of ten thousand nodes.
+ This number greatly exceeds the capacity for which existing message passing
+ technologies have been designed and implemented.
+\layout Standard
+
+In addition to the scalability requirements of the network, these technologies
+ must also be able to support a scalable implementation of the Message Passing
+ Interface (MPI)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPIstandard}
+
+\end_inset 
+
+ standard, which has become the 
+\shape italic 
+de facto
+\shape default 
+ standard for parallel scientific computing.
+ While MPI does not impose any scalability limitations, existing message
+ passing technologies do not provide the functionality needed to allow implement
+ations of MPI to meet the scalability requirements of Cplant.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ any inherent scalability limitations: 
+\layout Itemize
+
+Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+ and TCP/IP sockets, have limitations on the number of peer connections
+ that can be established.
+\layout Itemize
+
+Network independence - Many communication systems depend on the host processor
+ to perform operations in order for messages in the network to be consumed.
+ Message consumption from the network should not be dependent on host processor
+ activity, such as the operating system scheduler or user-level thread scheduler.
+\layout Itemize
+
+User-level flow control - Many communication systems manage flow control
+ internally to avoid depleting resources, which can significantly impact
+ performance as the number of communicating processes increases.
+\layout Itemize
+
+OS Bypass - High performance network communication should not involve memory
+ copies into or out of a kernel-managed protocol stack.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ scalability limitations for an implementation of MPI:
+\layout Itemize
+
+Receiver-managed - Sender-managed message passing implementations require
+ a persistent block of memory to be available for every process, requiring
+ memory resources to increase with job size and requiring user-level flow
+ control mechanisms to manage these resources.
+\layout Itemize
+
+User-level Bypass - While OS Bypass is necessary for high-performance, it
+ alone is not sufficient to support the Progress Rule of MPI asynchronous
+ operations.
+\layout Itemize
+
+Unexpected messages - Few communication systems have support for receiving
+ messages for which there is no prior notification.
+ Support for these types of messages is necessary to avoid flow control
+ and protocol overhead.
+\layout Section
+
+Background
+\layout Standard
+
+Portals was originally designed for and implemented on the nCube machine
+ as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{SUNMOS}
+
+\end_inset 
+
+ and Puma\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaOS}
+
+\end_inset 
+
+ lightweight kernel development projects.
+ Portals went through two design phases, the latter of which is used on
+ the 4500-node Intel TeraFLOPS machine\SpecialChar ~
+
+\begin_inset LatexCommand \cite{TFLOPS}
+
+\end_inset 
+
+.
+ Portals have been very successful in meeting the needs of such a large
+ machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaMPI}
+
+\end_inset 
+
+, but also for implementing the scalable run-time environment and parallel
+ I/O capabilities of the machine.
+\layout Standard
+
+The second generation Portals implementation was designed to take full advantage
+ of the hardware architecture of large MPP machines.
+ However, efforts to implement this same design on commodity cluster technology
+ identified several limitations, due to the differences in network hardware
+ as well as to shortcomings in the design of Portals.
+\layout Section
+
+Scalability
+\layout Standard
+
+The primary goal in the design of Portals is scalability.
+ Portals are designed specifically for an implementation capable of supporting
+ a parallel job running on tens of thousands of nodes.
+ Performance is critical only in terms of scalability.
+ That is, the level of message passing performance is characterized by how
+ far it allows an application to scale and not by how it performs in micro-bench
+marks (e.g., a two node bandwidth or latency test).
+\layout Standard
+
+The Portals API is designed to allow for scalability, not to guarantee it.
+ Portals cannot overcome the shortcomings of a poorly designed application
+ program.
+ Applications that have inherent scalability limitations, either through
+ design or implementation, will not be transformed by Portals into scalable
+ applications.
+ Scalability must be addressed at all levels.
+ Portals do not inhibit scalability, but do not guarantee it either.
+\layout Standard
+
+To support scalability, the Portals interface maintains a minimal amount
+ of state.
+ Portals provide reliable, ordered delivery of messages between pairs of
+ processes.
+ They are connectionless: a process is not required to explicitly establish
+ a point-to-point connection with another process in order to communicate.
+ Moreover, all buffers used in the transmission of messages are maintained
+ in user space.
+ The target process determines how to respond to incoming messages, and
+ messages for which there are no buffers are discarded.
+\layout Section
+
+Communication Model
+\layout Standard
+
+Portals combine the characteristics of both one-sided and two-sided communication.
+ They define a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching put
+\begin_inset Quotes erd
+\end_inset 
+
+ operation and a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching get
+\begin_inset Quotes erd
+\end_inset 
+
+ operation.
+ The destination of a put (or send) is not an explicit address; instead,
+ each message contains a set of match bits that allow the receiver to determine
+ where incoming messages should be placed.
+ This flexibility allows Portals to support both traditional one-sided operation
+s and two-sided send/receive operations.
+\layout Standard
+
+Portals allow the target to determine whether incoming messages are acceptable.
+ A target process can choose to accept message operations from any specific
+ process or can choose to ignore message operations from any specific process.
+\layout Section
+
+Zero Copy, OS Bypass and Application Bypass
+\layout Standard
+
+In traditional system architectures, network packets arrive at the network
+ interface card (NIC), are passed through one or more protocol layers in
+ the operating system, and eventually copied into the address space of the
+ application.
+ As network bandwidth began to approach memory copy rates, reduction of
+ memory copies became a critical concern.
+ This concern led to the development of zero-copy message passing protocols
+ in which message copies are eliminated or pipelined to avoid the loss of
+ bandwidth.
+\layout Standard
+
+A typical zero-copy protocol has the NIC generate an interrupt for the CPU
+ when a message arrives from the network.
+ The interrupt handler then controls the transfer of the incoming message
+ into the address space of the appropriate application.
+ The interrupt latency, the time from the initiation of an interrupt until
+ the interrupt handler is running, is fairly significant.
+ To avoid this cost, some modern NICs have processors that can be programmed
+ to implement part of a message passing protocol.
+ Given a properly designed protocol, it is possible to program the NIC to
+ control the transfer of incoming messages, without needing to interrupt
+ the CPU.
+ Because this strategy does not need to involve the OS on every message
+ transfer, it is frequently called 
+\begin_inset Quotes eld
+\end_inset 
+
+OS Bypass.
+\begin_inset Quotes erd
+\end_inset 
+
+ ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+, FM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{FM2}
+
+\end_inset 
+
+, GM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{GM}
+
+\end_inset 
+
+, and Portals are examples of OS Bypass protocols.
+\layout Standard
+
+Many protocols that support OS Bypass still require that the application
+ actively participate in the protocol to ensure progress.
+ As an example, the long message protocol of PM requires that the application
+ receive and reply to a request to put or get a long message.
+ This complicates the runtime environment, requiring a thread to process
+ incoming requests, and significantly increases the latency required to
+ initiate a long message protocol.
+ The Portals message passing protocol does not require activity on the part
+ of the application to ensure progress.
+ We use the term 
+\begin_inset Quotes eld
+\end_inset 
+
+Application Bypass
+\begin_inset Quotes erd
+\end_inset 
+
+ to refer to this aspect of the Portals protocol.
+\layout Section
+
+Faults 
+\layout Standard
+
+Given the number of components that we are dealing with and the fact that
+ we are interested in supporting applications that run for very long times,
+ failures are inevitable.
+ The Portals API recognizes that the underlying transport may not be able
+ to successfully complete an operation once it has been initiated.
+ This is reflected in the fact that the Portals API reports three types
+ of events: events indicating the initiation of an operation, events indicating
+ the successful completion of an operation, and events indicating the unsuccessf
+ul completion of an operation.
+ Every initiation event is eventually followed by a successful completion
+ event or an unsuccessful completion event.
+\layout Standard
+
+Between the time an operation is started and the time that the operation
+ completes (successfully or unsuccessfully), any memory associated with
+ the operation should be considered volatile.
+ That is, the memory may be changed in unpredictable ways while the operation
+ is progressing.
+ Once the operation completes, the memory associated with the operation
+ will not be subject to further modification (from this operation).
+ Notice that unsuccessful operations may alter memory in an essentially
+ unpredictable fashion.
+\layout Chapter
+
+An Overview of the Portals API
+\begin_inset LatexCommand \label{sec:apiover}
+
+\end_inset 
+
+
+\layout Standard
+
+In this section, we give a conceptual overview of the Portals API.
+ The goal is to provide a context for understanding the detailed description
+ of the API presented in the next section.
+\layout Section
+
+Data Movement
+\begin_inset LatexCommand \label{sec:dmsemantics}
+
+\end_inset 
+
+
+\layout Standard
+
+A Portal represents an opening in the address space of a process.
+ Other processes can use a Portal to read (get) or write (put) the memory
+ associated with the portal.
+ Every data movement operation involves two processes, the 
+\series bold 
+initiator
+\series default 
+ and the 
+\series bold 
+target
+\series default 
+.
+ The initiator is the process that initiates the data movement operation.
+ The target is the process that responds to the operation by either accepting
+ the data for a put operation, or replying with the data for a get operation.
+\layout Standard
+
+In this discussion, activities attributed to a process may refer to activities
+ that are actually performed by the process or 
+\emph on 
+on behalf of the process
+\emph default 
+.
+ The inclusiveness of our terminology is important in the context of 
+\emph on 
+application bypass
+\emph default 
+.
+ In particular, when we note that the target sends a reply in the case of
+ a get operation, it is possible that the reply will be generated by another
+ component in the system, bypassing the application.
+\layout Standard
+
+Figures\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:put}
+
+\end_inset 
+
+ and 
+\begin_inset LatexCommand \ref{fig:get}
+
+\end_inset 
+
+ present graphical interpretations of the Portal data movement operations:
+ put and get.
+ In the case of a put operation, the initiator sends a put request message
+ containing the data to the target.
+ The target translates the Portal addressing information in the request
+ using its local Portal structures.
+ When the request has been processed, the target optionally sends an acknowledge
+ment message.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename put.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Put (Send)
+\begin_inset LatexCommand \label{fig:put}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+In the case of a get operation, the initiator sends a get request to the
+ target.
+ As with the put operation, the target translates the Portal addressing
+ information in the request using its local Portal structures.
+ Once it has translated the Portal addressing information, the target sends
+ a reply that includes the requested data.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename get.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Get
+\begin_inset LatexCommand \label{fig:get}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+We should note that Portal address translations are only performed on nodes
+ that respond to operations initiated by other nodes.
+ Acknowledgements and replies to get operations bypass the portals address
+ translation structures.
+\layout Section
+
+Portal Addressing
+\begin_inset LatexCommand \label{subsec:paddress}
+
+\end_inset 
+
+
+\layout Standard
+
+One-sided data movement models (e.g., shmem\SpecialChar ~
+
+\begin_inset LatexCommand \cite{CraySHMEM}
+
+\end_inset 
+
+, ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, MPI-2\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPI2}
+
+\end_inset 
+
+) typically use a triple to address memory on a remote node.
+ This triple consists of a process id, memory buffer id, and offset.
+ The process id identifies the target process, the memory buffer id specifies
+ the region of memory to be used for the operation, and the offset specifies
+ an offset within the memory buffer.
+\layout Standard
+
+In addition to the standard address components (process id, memory buffer
+ id, and offset), a Portal address includes a set of match bits.
+ This addressing model is appropriate for supporting one-sided operations
+ as well as traditional two-sided message passing operations.
+ Specifically, the Portals API provides the flexibility needed for an efficient
+ implementation of MPI-1, which defines two-sided operations with one-sided
+ completion semantics.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:portals}
+
+\end_inset 
+
+ presents a graphical representation of the structures used by a target
+ in the interpretation of a Portal address.
+ The process id is used to route the message to the appropriate node and
+ is not reflected in this diagram.
+ The memory buffer id, called the 
+\series bold 
+portal id
+\series default 
+, is used as an index into the Portal table.
+ Each element of the Portal table identifies a match list.
+ Each element of the match list specifies two bit patterns: a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+don't care
+\begin_inset Quotes erd
+\end_inset 
+
+ bits, and a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+must match
+\begin_inset Quotes erd
+\end_inset 
+
+ bits.
+ In addition to the two sets of match bits, each match list element has
+ at most one memory descriptor.
+ Each memory descriptor identifies a memory region and an optional event
+ queue.
+ The memory region specifies the memory to be used in the operation and
+ the event queue is used to record information about these operations.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename portals.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 305pt
+       lyxheight 106pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Addressing Structures
+\begin_inset LatexCommand \label{fig:portals}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+ illustrates the steps involved in translating a Portal address, starting
+ from the first element in a match list.
+ If the match criteria specified in the match list entry are met and the
+ memory descriptor list accepts the operation
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Memory descriptors can reject operations because a threshold has been exceeded
+ or because the memory region does not have sufficient space, see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+
+, the operation (put or get) is performed using the memory region specified
+ in the memory descriptor.
+ If the memory descriptor specifies that it is to be unlinked when a threshold
+ has been exceeded, the match list entry is removed from the match list
+ and the resources associated with the memory descriptor and match list
+ entry are reclaimed.
+ Finally, if there is an event queue specified in the memory descriptor,
+ the operation is logged in the event queue.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename flow_new.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 447pt
+       lyxheight 282pt
+\end_inset 
+
+
+\layout Caption
+
+Portals Address Translation
+\begin_inset LatexCommand \label{fig:flow}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+If the match criteria specified in the match list entry are not met, or
+ there is no memory descriptor associated with the match list entry, or
+ the memory descriptor associated with the match list entry rejects the
+ operation, the address translation continues with the next match list entry.
+ If the end of the match list has been reached, the address translation
+ is aborted and the incoming request is discarded.
+\layout Section
+
+Access Control
+\layout Standard
+
+A process can control access to its portals using an access control list.
+ Each entry in the access control list specifies a process id and a Portal
+ table index.
+ The access control list is actually an array of entries.
+ Each incoming request includes an index into the access control list (i.e.,
+ a 
+\begin_inset Quotes eld
+\end_inset 
+
+cookie
+\begin_inset Quotes erd
+\end_inset 
+
+ or hint).
+ If the id of the process issuing the request doesn't match the id specified
+ in the access control list entry or the Portal table index specified in
+ the request doesn't match the Portal table index specified in the access
+ control list entry, the request is rejected.
+ Process identifiers and Portal table indexes may include wild card values
+ to increase the flexibility of this mechanism.
+\layout Standard
+
+Two aspects of this design merit further discussion.
+ First, the model assumes that the information in a message header, the
+ sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header
+ is trustworthy; however, using cryptographic techniques, we could easily
+ devise a protocol that would ensure the authenticity of the sender.
+\layout Standard
+
+Second, because the access check is performed by the receiver, it is possible
+ that a malicious process will generate thousands of messages that will
+ be denied by the receiver.
+ This could saturate the network and/or the receiver, resulting in a 
+\emph on 
+denial of service
+\emph default 
+ attack.
+ Moving the check to the sender using capabilities would remove the potential
+ for this form of attack.
+ However, the solution introduces the complexities of capability management
+ (exchange of capabilities, revocation, protections, etc).
+\layout Section
+
+Multi-threaded Applications
+\begin_inset LatexCommand \label{sec:threads}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports a generic view of multi-threaded applications.
+ From the perspective of the Portals API, an application program is defined
+ by a set of processes.
+ Each process defines a unique address space.
+ The Portals API defines access to this address space from other processes
+ (using portals addressing and the data movement operations).
+ A process may have one or more 
+\emph on 
+threads
+\emph default 
+ executing in its address space.
+\layout Standard
+
+With the exception of 
+\emph on 
+PtlEQWait
+\emph default 
+, every function in the Portals API is non-blocking and atomic with respect
+ to both other threads and external operations that result from data movement
+ operations.
+ While individual operations are atomic, sequences of these operations may
+ be interleaved between different threads and with external operations.
+ The Portals API does not provide any mechanisms to control this interleaving.
+ It is expected that these mechanisms will be provided by the API used to
+ create threads.
+\layout Chapter
+
+The Portals API
+\begin_inset LatexCommand \label{sec:api}
+
+\end_inset 
+
+
+\layout Section
+
+Naming Conventions
+\begin_inset LatexCommand \label{sec:conv}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API defines two types of entities: functions and types.
+ Function names always start with 
+\emph on 
+Ptl
+\emph default 
+ and use mixed upper and lower case.
+ When used in the body of this report, function names appear in italic face,
+ e.g., 
+\emph on 
+PtlInit
+\emph default 
+.
+ The functions associated with an object type will have names that start
+ with 
+\emph on 
+Ptl
+\emph default 
+, followed by the two letter object type code shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ As an example, the function 
+\emph on 
+PtlEQAlloc
+\emph default 
+ allocates resources for an event queue.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Object Type Codes
+\begin_inset LatexCommand \label{tab:objcodes}
+
+\end_inset 
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+\backslash 
+medskip
+\newline 
+  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\emph on 
+xx
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+EQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Event Queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Memory Descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ ME 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Match list Entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Network Interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Type names use lower case with underscores to separate words.
+ Each type name starts with 
+\family typewriter 
+ptl
+\family default 
+_ and ends with 
+\family typewriter 
+_t
+\family default 
+.
+ When used in the body of this report, type names appear in a fixed font,
+ e.g., 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+.
+\layout Standard
+
+Names for constants use upper case with underscores to separate words.
+ Each constant name starts with 
+\family typewriter 
+PTL_
+\family default 
+.
+ When used in the body of this report, constant names appear in a fixed font,
+ e.g., 
+\family typewriter 
+PTL_OK
+\family default 
+.
+\layout Section
+
+Base Types
+\layout Standard
+
+The Portals API defines a variety of base types.
+ These types represent a simple renaming of the base types provided by the
+ C programming language.
+ In most cases these new type names have been introduced to improve type
+ safety and to avoid issues arising from differences in representation sizes
+ (e.g., 16-bit or 32-bit integers).
+\layout Subsection
+
+Sizes
+\begin_inset LatexCommand \label{sec:size-t}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_size_t
+\family default 
+ is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection
+
+Handles
+\begin_inset LatexCommand \label{sec:handle-type}
+
+\end_inset 
+
+\layout Standard
+
+Objects maintained by the API are accessed through handles.
+ Handle types have names of the form 
+\family typewriter 
+ptl_handle_
+\emph on 
+xx
+\emph default 
+_t
+\family default 
+, where 
+\emph on 
+xx
+\emph default 
+ is one of the two letter object type codes shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ For example, the type 
+\family typewriter 
+ptl_handle_ni_t
+\family default 
+ is used for network interface handles.
+\layout Standard
+
+Each type of object is given a unique handle type to enhance type checking.
+ The type, 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+, can be used when a generic handle is needed.
+ Every handle value can be converted into a value of type 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+ without loss of information.
+\layout Standard
+
+Handles are not simple values.
+ Every portals object is associated with a specific network interface and
+ an identifier for this interface (along with an object identifier) is part
+ of the handle for the object.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, of type 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+, is used to indicate the absence of an event queue.
+ See sections 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+ for uses of this value.
+\layout Subsection
+
+Indexes
+\begin_inset LatexCommand \label{sec:index-type}
+
+\end_inset 
+
+\layout Standard
+
+The types 
+\family typewriter 
+ptl_pt_index_t
+\family default 
+ and 
+\family typewriter 
+ptl_ac_index_t
+\family default 
+ are integral types used for representing Portal table indexes and access
+ control table indexes, respectively.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+ for limits on values of these types.
+\layout Subsection
+
+Match Bits
+\begin_inset LatexCommand \label{sec:mb-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+ is capable of holding unsigned 64-bit integer values.
+\layout Subsection
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_interface_t
+\family default 
+ is an integral type used for identifying different network interfaces.
+ Users will need to consult the local documentation to determine appropriate
+ values for the interfaces available.
+ The special value 
+\family typewriter 
+PTL_IFACE_DEFAULT
+\family default 
+ identifies the default interface.
+\layout Subsection
+
+Identifiers
+\begin_inset LatexCommand \label{sec:id-type}
+
+\end_inset 
+
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_nid_t
+\family default 
+ is an integral type used for representing node ids
+\family typewriter 
+, ptl_pid_t
+\family default 
+ is an integral type for representing process ids, and 
+\family typewriter 
+ptl_uid_t 
+\family default 
+is an integral type for representing user ids.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ matches any process identifier, PTL_NID_ANY matches any node identifier,
+ and 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ matches any user identifier.
+ See sections 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+ for uses of these values.
+\layout Subsection
+
+Status Registers
+\begin_inset LatexCommand \label{sec:stat-type}
+
+\end_inset 
+
+
+\layout Standard
+
+Each network interface maintains an array of status registers that can be
+ accessed using the 
+\family typewriter 
+PtlNIStatus
+\family default 
+ function (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ The type 
+\family typewriter 
+ptl_sr_index_t
+\family default 
+ defines the types of indexes that can be used to access the status registers.
+ The only index defined for all implementations is 
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+ which identifies the status register that counts the dropped requests for
+ the interface.
+ Other indexes (and registers) may be defined by the implementation.
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_sr_value_t
+\family default 
+ defines the types of values held in status registers.
+ This is a signed integer type.
+ The size is implementation dependent, but must be at least 32 bits.
+\layout Section
+
+Initialization and Cleanup
+\begin_inset LatexCommand \label{sec:init}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API includes a function, 
+\emph on 
+PtlInit
+\emph default 
+, to initialize the library and a function, 
+\emph on 
+PtlFini
+\emph default 
+, to clean up after the application is done using the library.
+\layout Subsection
+
+PtlInit
+\begin_inset LatexCommand \label{sec:ptlinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlInit( int *max_interfaces );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlInit
+\emph default 
+ function initializes the Portals library.
+ PtlInit must be called at least once by a process before any thread makes
+ a Portals function call, but may be safely called more than once.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_FAIL Indicates an error during initialization.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+max_interfaces
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+max_interfaces
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the maximum number of interfaces
+ that can be initialized.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlFini
+\begin_inset LatexCommand \label{sec:ptlfini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+void PtlFini( void );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlFini
+\emph default 
+ function cleans up after the Portals library is no longer needed by a process.
+ After this function is called, calls to any of the functions defined by
+ the Portals API or use of the structures set up by the Portals API will
+ result in undefined behavior.
+ This function should be called once and only once during termination by
+ a process.
+ Typically, this function will be called in the exit sequence of a process.
+ Individual threads should not call PtlFini when they terminate.
+\layout Section
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports the use of multiple network interfaces.
+ However, each interface is treated as an independent entity.
+ Combining interfaces (e.g., 
+\begin_inset Quotes eld
+\end_inset 
+
+bonding
+\begin_inset Quotes erd
+\end_inset 
+
+ to create a higher bandwidth connection) must be implemented by the application
+ or embedded in the underlying network.
+ Interfaces are treated as independent entities to make it easier to cache
+ information on individual network interface cards.
+\layout Standard
+
+Once initialized, each interface provides a Portal table, an access control
+ table, and a collection of status registers.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for a discussion of updating Portal table entries using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ function.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+ for a discussion of the initialization and updating of entries in the access
+ control table.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+ for a discussion of the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function which can be used to determine the value of a status register.
+\layout Standard
+
+Every other type of Portal object (e.g., memory descriptor, event queue, or
+ match list entry) is associated with a specific network interface.
+ The association to a network interface is established when the object is
+ created and is encoded in the handle for the object.
+\layout Standard
+
+Each network interface is initialized and shut down independently.
+ The initialization routine, 
+\emph on 
+PtlNIInit
+\emph default 
+, returns a handle for an interface object which is used in all subsequent
+ Portal operations.
+ The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to shut down an interface and release any resources that
+ are associated with the interface.
+ Network interface handles are associated with processes, not threads.
+ All threads in a process share all of the network interface handles.
+\layout Standard
+
+The Portals API also defines the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function to query the status registers for a network interface, the 
+\emph on 
+PtlNIDist
+\emph default 
+ function to determine the 
+\begin_inset Quotes eld
+\end_inset 
+
+distance
+\begin_inset Quotes erd
+\end_inset 
+
+ to another process, and the 
+\emph on 
+PtlNIHandle
+\emph default 
+ function to determine the network interface that an object is associated
+ with.
+\layout Subsection
+
+PtlNIInit
+\begin_inset LatexCommand \label{sec:niinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    int            max_match_entries;
+\newline 
+    int            max_mem_descriptors;
+\newline 
+    int            max_event_queues;
+\newline 
+    ptl_ac_index_t max_atable_index; 
+\newline 
+    ptl_pt_index_t max_ptable_index;
+\newline 
+} ptl_ni_limits_t;
+\newline 
+
+\newline 
+int PtlNIInit( ptl_interface_t  interface,
+\newline 
+               ptl_pid_t        pid,
+\newline 
+               ptl_ni_limits_t* desired,
+\newline 
+               ptl_ni_limits_t* actual,
+\newline 
+               ptl_handle_ni_t* handle );
+\layout Standard
+
+Values of type 
+\family typewriter 
+ptl_ni_limits_t
+\family default 
+ include the following members:
+\layout Description
+
+max_match_entries Maximum number of match entries that can be allocated
+ at any one time.
+\layout Description
+
+max_mem_descriptors Maximum number of memory descriptors that can be allocated
+ at any one time.
+\layout Description
+
+max_event_queues Maximum number of event queues that can be allocated at
+ any one time.
+\layout Description
+
+max_atable_index Largest access control table index for this interface,
+ valid indexes range from zero to 
+\family typewriter 
+max_atable_index
+\family default 
+, inclusive.
+\layout Description
+
+max_ptable_index Largest Portal table index for this interface, valid indexes
+ range from zero to 
+\family typewriter 
+max_ptable_index
+\family default 
+, inclusive.
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIInit
+\emph default 
+ function is used to initialize the Portals API for a network interface.
+ This function must be called at least once by each process before any other
+ operations that apply to the interface by any process or thread.
+ For subsequent calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+ from within the same process (either by different threads or the same thread),
+ the desired limits will be ignored and the call will return the existing
+ NI handle.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INIT_DUP Indicates a duplicate initialization of 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INIT_INV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to initialize the
+ interface.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+pid
+\family default 
+ is not a valid process id.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+actual 
+\family default 
+or
+\family typewriter 
+ handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the network interface to be initialized.
+  (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+ for a discussion of  values used to identify network interfaces.)
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+pid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the desired process id (for well-known process ids).
+ The value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ may be used to have the process id assigned by the underlying library.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+desired
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If non-NULL, points to a structure that holds the desired limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+actual
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, the location pointed to by actual will hold the actual
+ limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the interface.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The use of desired is implementation dependent.
+ In particular, an implementation may choose to ignore this argument.
+\layout Subsection
+
+PtlNIFini
+\begin_inset LatexCommand \label{sec:nifini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIFini( ptl_handle_ni_t interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to release the resources allocated for a network interface.
+ Once the 
+\emph on 
+PtlNIFini
+\emph default 
+ operation has been started, the results of pending API operations (e.g.,
+ operations initiated by another thread) for this interface are undefined.
+ Similarly, the effects of incoming operations (puts and gets) or return
+ values (acknowledgements and replies) for this interface are undefined.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the interface to shut down.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlNIStatus
+\begin_inset LatexCommand \label{sec:nistatus}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIStatus( ptl_handle_ni_t interface,
+\newline 
+                 ptl_sr_index_t  status_register,
+\newline 
+                 ptl_sr_value_t* status );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIStatus
+\emph default 
+ function returns the value of a status register for the specified interface.
+ (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+ for more information on status register indexes and status register values.)
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_SR_INDX Indicates that 
+\family typewriter 
+status_register
+\family default 
+ is not a valid status register.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+status
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status_register
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An index for the status register to read.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the current value of the status
+ register.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The only status register that must be defined is a drop count register (
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+).
+ Implementations may define additional status registers.
+ Identifiers for the indexes associated with these registers should start
+ with the prefix 
+\family typewriter 
+PTL_SR_
+\family default 
+.
+\layout Subsection
+
+PtlNIDist
+\layout LyX-Code
+
+int PtlNIDist( ptl_handle_ni_t  interface,
+\newline 
+               ptl_process_id_t process,
+\newline 
+               unsigned long*   distance );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIDist
+\emph default 
+ function returns the distance to another process using the specified interface.
+ Distances are only defined relative to an interface.
+ Distance comparisons between different interfaces on the same process may
+ be meaningless.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+process
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+distance
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+process
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An identifier for the process whose distance is being  requested.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+distance
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  distance to the remote
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+This function should return a static measure of distance.
+ Examples include minimum latency, the inverse of available bandwidth, or
+ the number of switches between the two endpoints.
+\layout Subsection
+
+PtlNIHandle
+\layout LyX-Code
+
+int PtlNIHandle( ptl_handle_any_t handle,
+\newline 
+                 ptl_handle_ni_t* interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIHandle
+\emph default 
+ function returns a handle for the network interface with which the object
+ identified by 
+\family typewriter 
+handle
+\family default 
+ is associated.
+ If the object identified by 
+\family typewriter 
+handle
+\family default 
+ is a network interface, this function returns the same value it is passed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_HANDLE Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a valid handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the object.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the network interface
+ associated with 
+\family typewriter 
+handle
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Every handle should encode the network interface and the object id relative
+ to this interface.
+ Both are presumably encoded using integer values.
+\layout Section
+
+User Identification
+\begin_inset LatexCommand \label{sec:uid}
+
+\end_inset 
+
+
+\layout Standard
+
+Every process runs on behalf of a user.
+\layout Subsection
+
+PtlGetUid
+\layout LyX-Code
+
+int PtlGetUid( ptl_handle_ni_t   ni_handle,
+\newline 
+               ptl_uid_t*        uid );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+uid
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+uid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the user id for the calling
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that user identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, a process may have multiple
+ user identifiers.
+\layout Section
+
+Process Identification
+\begin_inset LatexCommand \label{sec:pid}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes that use the Portals API can be identified using a node id and
+ process id.
+ Every node accessible through a network interface has a unique node identifier
+ and every process running on a node has a unique process identifier.
+ As such, any process in the computing system can be identified by its node
+ id and process id.
+\layout Standard
+
+The Portals API defines a type, 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ for representing process ids and a function, 
+\emph on 
+PtlGetId
+\emph default 
+, which can be used to obtain the id of the current process.
+\layout Comment
+
+The Portals API does not include thread identifiers.
+  Messages are delivered to processes (address spaces) not threads (contexts
+ of  execution).
+\layout Subsection
+
+The Process Id Type
+\begin_inset LatexCommand \label{sec:pid-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_nid_t       nid; /* node id */
+\newline 
+    ptl_pid_t       pid; /* process id */
+\newline 
+} ptl_process_id_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ type uses two identifiers to represent a process id: a node id and a process
+ id.
+\layout Subsection
+
+PtlGetId
+\begin_inset LatexCommand \label{sub:PtlGetId}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGetId( ptl_handle_ni_t   ni_handle,
+\newline 
+              ptl_process_id_t* id );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+id
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the id for the calling process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that process identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, it may have multiple
+ node identifiers.
+\layout Section
+
+Match List Entries and Match Lists
+\begin_inset LatexCommand \label{sec:me}
+
+\end_inset 
+
+
+\layout Standard
+
+A match list is a chain of match list entries.
+ Each match list entry includes a memory descriptor and a set of match criteria.
+ The match criteria can be used to reject incoming requests based on process
+ id or the match bits provided in the request.
+ A match list is created using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ or 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ functions, which create a match list consisting of a single match list
+ entry, attach the match list to the specified Portal index, and return
+ a handle for the match list entry.
+ Match entries can be dynamically inserted into and removed from a match list
+ using the 
+\emph on 
+PtlMEInsert
+\emph default 
+ and 
+\emph on 
+PtlMEUnlink
+\emph default 
+ functions.
+\layout Subsection
+
+PtlMEAttach
+\begin_inset LatexCommand \label{sec:meattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t;
+\newline 
+
+\layout LyX-Code
+
+typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t;
+\newline 
+
+\layout LyX-Code
+
+int PtlMEAttach( ptl_handle_ni_t  interface,
+\newline 
+                 ptl_pt_index_t   index,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_unlink_t     unlink,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+ are used to control where a new item is inserted.
+ The value 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+ is used to insert the new item before the current item or before the head
+ of the list.
+ The value 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+ is used to insert the new item after the current item or after the last
+ item in the list.
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttach
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to the Portal table for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PTINDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid Portal table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="7" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The Portal table index where the match list  should be attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specifies the match criteria for the process id of the requestor.
+  The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to  wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+match_bits, ignorebits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specify the match criteria to apply  to the match bits in the incoming request.
+  The 
+\family typewriter 
+ignorebits
+\family default 
+ are used to mask out insignificant bits in the incoming match bits.
+  The resulting bits are then compared to the match list entry's match 
+ bits to determine if the incoming request meets the match criteria.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the match list entry should be unlinked when the last memory
+ descriptor associated with this match list  entry is unlinked.
+  (Note, the check for unlinking a match entry  only occurs when a memory
+ descriptor is unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be prepended or appended to
+ the existing match list.
+ If there is no existing list, this argument is ignored and the new match
+ entry becomes the only entry in the list.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEAttachAny
+\begin_inset LatexCommand \label{sec:attachany}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEAttachAny( ptl_handle_ni_t  interface,
+\newline 
+                    ptl_pt_index_t   *index,
+\newline 
+                    ptl_process_id_t matchid,
+\newline 
+                    ptl_match_bits_t match_bits,
+\newline 
+                    ptl_match_bits_t ignorebits,
+\newline 
+                    ptl_unlink_t     unlink,
+\newline 
+                    ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttachAny
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to an unused Portal table entry for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_PT_FULL Indicates that there are no free entries in the Portal table.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the Portal index where the
+ match list  has been attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid, match_bits, ignorebits, unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEInsert
+\begin_inset LatexCommand \label{sec:meinsert}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEInsert( ptl_handle_me_t  current,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_unlink_t     unlink,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEInsert
+\emph default 
+ function creates a new match list entry and inserts this entry into the
+ match list containing 
+\family typewriter 
+current
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+current
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match entry.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+current
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for a match entry.
+  The new match entry will be inserted immediately before or immediately
+ after this match entry.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\family default 
+, 
+\family typewriter 
+match_bits
+\family default 
+, 
+\family typewriter 
+ignorebits
+\family default 
+,  
+\family typewriter 
+unlink
+\family default 
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion  for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be inserted before or after
+ the 
+\family typewriter 
+current
+\family default 
+ entry.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEUnlink
+\begin_inset LatexCommand \label{sec:meunlink}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEUnlink( ptl_handle_me_t entry );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMEUnlink
+\emph default 
+ function can be used to unlink a match entry from a match list.
+ This operation also releases any resources associated with the match entry
+ (including the associated memory descriptor).
+ It is an error to use the match entry handle after calling 
+\emph on 
+PtlMEUnlink
+\emph default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+entry
+\family default 
+ is not a valid match entry handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+entry
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the match entry to be unlinked.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Memory Descriptors
+\begin_inset LatexCommand \label{sec:md}
+
+\end_inset 
+
+
+\layout Standard
+
+A memory descriptor contains information about a region of an application
+ process' memory and an event queue where information about the operations
+ performed on the memory descriptor is recorded.
+ The Portals API provides two operations to create memory descriptors: 
+\emph on 
+PtlMDAttach
+\emph default 
+, and 
+\emph on 
+PtlMDBind
+\emph default 
+; an operation to update a memory descriptor, 
+\emph on 
+PtlMDUpdate
+\emph default 
+; and an operation to unlink and release the resources associated with a
+ memory descriptor, 
+\emph on 
+PtlMDUnlink
+\emph default 
+.
+\layout Subsection
+
+The Memory Descriptor Type
+\begin_inset LatexCommand \label{sec:md-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    void*           start;
+\newline 
+    ptl_size_t      length;
+\newline 
+    int             threshold;
+\newline 
+    unsigned int    max_offset;
+\newline 
+    unsigned int    options;
+\newline 
+    void*           user_ptr;
+\newline 
+    ptl_handle_eq_t eventq;
+\newline 
+} ptl_md_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_md_t
+\family default 
+ type defines the application view of a memory descriptor.
+ Values of this type are used to initialize and update the memory descriptors.
+\layout Subsubsection
+
+Members
+\layout Description
+
+start,\SpecialChar ~
+length Specify the memory region associated with the memory descriptor.
+ The 
+\family typewriter 
+start
+\family default 
+ member specifies the starting address for the memory region and the 
+\family typewriter 
+length
+\family default 
+ member specifies the length of the region.
+ The 
+\family typewriter 
+start
+\family default 
+ member can be NULL provided that the 
+\family typewriter 
+length
+\family default 
+ member is zero.
+ (Zero length buffers are useful to record events.) There are no alignment
+ restrictions on the starting address or the length of the region; although,
+ unaligned messages may be slower (i.e., lower bandwidth and/or longer latency)
+ on some implementations.
+\layout Description
+
+threshold Specifies the maximum number of operations that can be performed
+ on the memory descriptor.
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+ for the different types of events).
+ In the usual case, the threshold value is decremented for each operation
+ on the memory descriptor.
+ When the threshold value is zero, the memory descriptor is 
+\emph on 
+inactive
+\emph default 
+, and does not respond to operations.
+ A memory descriptor can have an initial threshold value of zero to allow
+ for manipulation of an inactive memory descriptor by the local process.
+ A threshold value of 
+\family typewriter 
+PTL_MD_THRESH_INF
+\family default 
+ indicates that there is no bound on the number of operations that may be
+ applied to a memory descriptor.
+ Note that local operations (e.g., 
+\emph on 
+PtlMDUpdate
+\emph default 
+) are not applied to the threshold count.
+\layout Description
+
+max_offset Specifies the maximum local offset of a memory descriptor.
+ When the local offset of a memory descriptor exceeds this maximum, the
+ memory descriptor becomes 
+\shape italic 
+inactive
+\shape default 
+ and does not respond to further operations.
+\layout Description
+
+options Specifies the behavior of the memory descriptor.
+ There are five options that can be selected: enable put operations (yes
+ or no), enable get operations (yes or no), offset management (local or
+ remote), message truncation (yes or no), and acknowledgement (yes or no).
+ Values for this argument can be constructed using a bitwise or of the following
+ values: 
+\begin_deeper 
+\begin_deeper 
+\layout Description
+
+PTL_MD_OP_PUT Specifies that the memory descriptor will respond to 
+\emph on 
+put
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+put
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_OP_GET Specifies that the memory descriptor will respond to 
+\emph on 
+get
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+get
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory
+ region is provided by the incoming request.
+ By default, the offset is maintained locally.
+ When the offset is maintained locally, the offset is incremented by the
+ length of the request so that the next operation (put and/or get) will
+ access the next part of the memory region.
+\layout Description
+
+PTL_MD_TRUNCATE Specifies that the length provided in the incoming request
+ can be reduced to match the memory available in the region.
+ (The memory available in a memory region is determined by subtracting the
+ offset from the length of the memory region.) By default, if the length
+ in the incoming operation is greater than the amount of memory available,
+ the operation is rejected.
+\layout Description
+
+PTL_MD_ACK_DISABLE Specifies that an acknowledgement should 
+\emph on 
+not
+\emph default 
+ be sent for incoming 
+\emph on 
+put
+\emph default 
+ operations, even if requested.
+ By default, acknowledgements are sent for 
+\emph on 
+put
+\emph default 
+ operations that request an acknowledgement.
+ Acknowledgements are never sent for 
+\emph on 
+get
+\emph default 
+ operations.
+ The value sent in the reply serves as an implicit acknowledgement.
+\end_deeper 
+\layout Standard
+
+
+\series bold 
+Note
+\series default 
+: It is not considered an error to have a memory descriptor that does not
+ respond to either 
+\emph on 
+put
+\emph default 
+ or 
+\emph on 
+get
+\emph default 
+ operations: Every memory descriptor responds to 
+\emph on 
+reply
+\emph default 
+ operations.
+ Nor is it considered an error to have a memory descriptor that responds
+ to both 
+\emph on 
+put
+\emph default 
+ and 
+\emph on 
+get
+\emph default 
+ operations.
+\end_deeper 
+\layout Description
+
+user_ptr A user-specified value that is associated with the memory descriptor.
+ The value does not need to be a pointer, but must fit in the space used
+ by a pointer.
+ This value (along with other values) is recorded in events associated with
+ operations on this memory descriptor.
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Tying the memory descriptor to a user-defined value can be useful when multiple
+ memory descriptors share the same event queue or when the memory descriptor
+ needs to be associated with a data structure maintained by the application.
+ For example, an MPI implementation can set the 
+\family typewriter 
+user_ptr
+\family default 
+ argument to the value of an MPI Request.
+ This direct association allows for processing of memory descriptors by
+ the MPI implementation without a table lookup or a search for the appropriate
+ MPI Request.
+\end_inset 
+
+
+\layout Description
+
+eventq A handle for the event queue used to log the operations performed
+ on the memory region.
+ If this argument is 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, operations performed on this memory descriptor are not logged.
+\layout Subsection
+
+PtlMDAttach
+\begin_inset LatexCommand \label{sec:mdattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDAttach( ptl_handle_me_t  match,
+\newline 
+                 ptl_md_t         mem_desc,
+\newline 
+                 ptl_unlink_t     unlink_op,
+\newline 
+                 ptl_unlink_t     unlink_nofit,
+\newline 
+                 ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_unlink_t
+\family default 
+ are used to control whether an item is unlinked from a list.
+ The value 
+\family typewriter 
+PTL_UNLINK
+\family default 
+ enables unlinking.
+ The value 
+\family typewriter 
+PTL_RETAIN
+\family default 
+ disables unlinking.
+\layout Standard
+
+The 
+\emph on 
+PtlMDAttach
+\emph default 
+ operation is used to create a memory descriptor and attach it to a match
+ list entry.
+ An error code is returned if this match list entry already has an associated
+ memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INUSE Indicates that 
+\family typewriter 
+match
+\family default 
+ already has a memory descriptor attached.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+match
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface associated with 
+\family typewriter 
+match
+\family default 
+.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the match entry that the memory descriptor will be associated
+ with.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_op
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is  unlinked when it becomes
+ inactive, either because the operation threshold drops to zero or because
+ the maximum offset has been exceeded.
+  (Note, the check for unlinking a memory descriptor only occurs after
+ the completion of a successful operation.
+  If the threshold is set to zero during initialization or  using 
+\emph on 
+PtlMDUpdate
+\emph default 
+, the memory descriptor is 
+\series bold 
+not
+\series default 
+  unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_nofit
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is  unlinked when the space
+ remaining in the memory descriptor is not sufficient for a matching operation.
+ If an incoming message arrives at a memory descriptor that does
+ not have sufficient space and the 
+\series bold 
+PTL_MD_TRUNCATE
+\series default 
+ option is not specified, the memory descriptor will be unlinked.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument can be NULL, in which case the handle will not be returned.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDBind
+\begin_inset LatexCommand \label{sec:mdbind}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDBind( ptl_handle_ni_t  interface,
+\newline 
+               ptl_md_t         mem_desc,
+\newline 
+               ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDBind
+\emph default 
+ operation is used to create a 
+\begin_inset Quotes eld
+\end_inset 
+
+free floating
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor, i.e., a memory descriptor that is not associated with
+ a match list entry.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface, 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that the event queue associated with 
+\family typewriter 
+mem_desc
+\family default 
+ is not valid.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the network interface with which the memory descriptor will
+ be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the  memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument must be a valid address and cannot be NULL.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUnlink
+\begin_inset LatexCommand \label{sec:mdfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUnlink( ptl_handle_md_t mem_desc );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUnlink
+\emph default 
+ function unlinks the memory descriptor from any match list entry it may
+ be linked to and releases the resources associated with a memory descriptor.
+ (This function does not free the memory region associated with the memory
+ descriptor.) This function also releases the resources associated with a
+ floating memory descriptor.
+ Only memory descriptors with no pending operations may be unlinked.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_MD_INUSE Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ has pending operations and cannot be unlinked.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUpdate
+\begin_inset LatexCommand \label{sec:mdupdate}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUpdate( ptl_handle_md_t mem_desc,
+\newline 
+                 ptl_md_t*       old_md,
+\newline 
+                 ptl_md_t*       new_md,
+\newline 
+                 ptl_handle_eq_t testq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function provides a conditional, atomic update operation for memory descriptors.
+ The memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is only updated if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ The intent is to only enable updates to the memory descriptor when no new
+ messages have arrived since the last time the queue was checked.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+ for an example of how this function can be used.
+\layout Standard
+
+If 
+\family typewriter 
+new_md
+\family default 
+ is not NULL the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ will be updated
+ to reflect the values in the structure pointed to by 
+\family typewriter 
+new_md
+\family default 
+ if 
+\family typewriter 
+testq
+\family default 
+ has the value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+ or if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ If 
+\family typewriter 
+old_md
+\family default 
+ is not NULL, the current value of the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is recorded in the location identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_NOUPDATE Indicates that the update was not performed because 
+\family typewriter 
+testq
+\family default 
+ was not empty.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_ILL_MD Indicates that the value pointed to by 
+\family typewriter 
+new_md
+\family default 
+ is not a legal memory descriptor (e.g., the memory region specified by the
+ memory descriptor may be invalid).
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+testq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+new_md
+\family default 
+ or 
+\family typewriter 
+old_md
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+old_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+old_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, the current value of the memory descriptor will be stored in the location
+ identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+new_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+new_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, this argument provides the new values for the memory descriptor, if the
+ update is performed.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+testq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for an event queue used to predicate the update.
+ If 
+\family typewriter 
+testq
+\family default 
+ is equal to 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, the update is performed unconditionally.
+  Otherwise, the update is performed if and only if 
+\family typewriter 
+testq
+\family default 
+ is empty.
+  If the update is  not performed, the function returns the value 
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+.
+  (Note, the 
+\family typewriter 
+testq
+\family default 
+ argument does not need to be the same as  the event queue associated with
+ the memory descriptor.)
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Standard
+
+The conditional update can be used to ensure that the memory descriptor
+ has not changed between the time it was examined and the time it is updated.
+ In particular, it is needed to support an MPI implementation where the
+ activity of searching an unexpected message queue and posting a receive
+ must be atomic.
+\layout Section
+
+Events and Event Queues
+\begin_inset LatexCommand \label{sec:eq}
+
+\end_inset 
+
+
+\layout Standard
+
+Event queues are used to log operations performed on memory descriptors.
+ They can also be used to hold acknowledgements for completed 
+\emph on 
+put
+\emph default 
+ operations and to note when the data specified in a 
+\emph on 
+put
+\emph default 
+ operation has been sent (i.e., when it is safe to reuse the buffer that holds
+ this data).
+ Multiple memory descriptors can share a single event queue.
+\layout Standard
+
+In addition to the 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+ type, the Portals API defines two types associated with events: The 
+\family typewriter 
+
+\newline 
+ptl_event_kind_t
+\family default 
+ type defines the kinds of events that can be stored in an event queue.
+ The 
+\family typewriter 
+ptl_event_t
+\family default 
+ type defines a structure that holds the information associated with an
+ event.
+\layout Standard
+
+The Portals API also provides four functions for dealing with event queues:
+ The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to allocate the API resources needed for an event queue,
+ the 
+\emph on 
+PtlEQFree
+\emph default 
+ function is used to release these resources, the 
+\emph on 
+PtlEQGet
+\emph default 
+ function can be used to get the next event from an event queue, and the
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block a process (or thread) until an event queue
+ has at least one event.
+\layout Subsection
+
+Kinds of Events
+\begin_inset LatexCommand \label{sec:ek-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { 
+\newline 
+    PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL,
+\newline 
+    PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL,
+\newline 
+    PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL,
+\newline 
+    PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL,
+\newline 
+    PTL_EVENT_ACK,
+\newline 
+    PTL_EVENT_UNLINK
+\newline 
+} ptl_event_kind_t;
+\layout Standard
+\noindent 
+The Portals API defines fourteen types of events that can be logged in an
+ event queue: 
+\layout Description
+
+PTL_EVENT_GET_START A remote 
+\emph on 
+get
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_GET_END A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed successfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_GET_FAIL A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed unsuccessfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_PUT_START A remote 
+\emph on 
+put
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should be considered
+ volatile until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_PUT_END A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed successfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_PUT_FAIL A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed unsuccessfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_REPLY_START A 
+\emph on 
+reply
+\emph default 
+ operation has been started on the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_END A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed successfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_FAIL A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed unsuccessfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_ACK An 
+\emph on 
+acknowledgement
+\emph default 
+ was received.
+ This event is logged when the acknowledgement is received.
+\layout Description
+
+PTL_EVENT_SEND_START An outgoing 
+\emph on 
+send
+\emph default 
+ operation has been started.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_SEND_END A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed successfully.
+ This event is logged after the entire buffer has been sent and it is safe
+ for the application to reuse the buffer.
+\layout Description
+
+PTL_EVENT_SEND_FAIL A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed unsuccessfully.
+ The process can safely manipulate the memory or free the memory descriptor
+ once it sees this event.
+\layout Description
+
+PTL_EVENT_UNLINK A memory descriptor associated with this event queue has
+ been automatically unlinked.
+ This event is not generated when a memory descriptor is explicitly unlinked
+ by calling 
+\shape italic 
+PtlMDUnlink
+\shape default 
+.
+ This event does not decrement the threshold count.
+\layout Subsection
+
+Event Ordering
+\layout Standard
+
+The Portals API guarantees that when a process initiates two operations
+ on a remote process, the operations will be initiated on the remote process
+ in the same order that they were initiated on the original process.
+ As an example, if process A initiates two 
+\emph on 
+put
+\emph default 
+ operations, 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+, on process B, the Portals API guarantees that process A will receive the
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ in the same order that process B receives the 
+\family typewriter 
+PTL_EVENT_PUT_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+.
+ Notice that the API does not guarantee that the start events will be delivered
+ in the same order that process A initiated the 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ operations.
+ If process A needs to ensure the ordering of these operations, it should
+ include code to wait for the initiation of 
+\emph on 
+x
+\emph default 
+ before it initiates 
+\emph on 
+y
+\emph default 
+.
+\layout Subsection
+
+Failure Notification
+\layout Standard
+
+Operations may fail to complete successfully; however, unless the node itself
+ fails, every operation that is started will eventually complete.
+ While an operation is in progress, the memory associated with the operation
+ should not be viewed (in the case of a put or a reply) or altered (in the
+ case of a send or get).
+ Operation completion, whether successful or unsuccessful, is final.
+ That is, when an operation completes, the memory associated with the operation
+ will no longer be read or altered by the operation.
+ A network interface can use the 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+ to define more specific information regarding the failure of the operation
+ and record this information in the 
+\family typewriter 
+ni_fail_type
+\family default 
+ field of the event.
+\layout Subsection
+
+The Event Type
+\begin_inset LatexCommand \label{sec:event-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_event_kind_t      type;
+\newline 
+    ptl_process_id_t      initiator;
+\newline 
+    ptl_uid_t             uid;
+\layout LyX-Code
+
+    ptl_pt_index_t        portal;
+\newline 
+    ptl_match_bits_t      match_bits;
+\newline 
+    ptl_size_t            rlength;
+\newline 
+    ptl_size_t            mlength;
+\newline 
+    ptl_size_t            offset; 
+\newline 
+    ptl_handle_md_t       md_handle;
+\newline 
+    ptl_md_t              mem_desc;
+\newline 
+    ptl_hdr_data_t        hdr_data;
+\newline 
+    ptl_seq_t             link;
+\newline 
+    ptl_ni_fail_t         ni_fail_type;
+\newline 
+    volatile ptl_seq_t    sequence;
+\newline 
+} ptl_event_t;
+\layout Standard
+\noindent 
+An event structure includes the following members: 
+\layout Description
+
+type Indicates the type of the event.
+\layout Description
+
+initiator The id of the initiator.
+\layout Description
+
+portal The Portal table index specified in the request.
+\layout Description
+
+match_bits A copy of the match bits specified in the request.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for more information on match bits.
+\layout Description
+
+rlength The length (in bytes) specified in the request.
+\layout Description
+
+mlength The length (in bytes) of the data that was manipulated by the operation.
+ For truncated operations, the manipulated length will be the number of
+ bytes specified by the memory descriptor (possibly with an offset).
+ For all other operations, the manipulated length will be the length of
+ the requested operation.
+\layout Description
+
+offset Is the displacement (in bytes) into the memory region that the operation
+ used.
+ The offset can be determined by the operation (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+) for a remote managed memory descriptor, or by the local memory descriptor
+ (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+).
+\layout Description
+
+md_handle Is the handle to the memory descriptor associated with the event.
+\layout Description
+
+mem_desc Is the state of the memory descriptor immediately after the event
+ has been processed.
+\layout Description
+
+hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+).
+\layout Description
+
+link The 
+\emph on 
+link
+\emph default 
+ member is used to link 
+\family typewriter 
+START
+\family default 
+ events with the 
+\family typewriter 
+END
+\family default 
+ or 
+\family typewriter 
+FAIL
+\family default 
+ event that signifies completion of the operation.
+ The 
+\emph on 
+link
+\emph default 
+ member will be the same for the two events associated with an operation.
+ The link member is also used to link an 
+\family typewriter 
+UNLINK
+\family default 
+ event with the event that caused the memory descriptor to be unlinked.
+\layout Description
+
+sequence The sequence number for this event.
+ Sequence numbers are unique to each event.
+\layout Comment
+
+The 
+\emph on 
+sequence
+\emph default 
+ member is the last member and is volatile to support SMP implementations.
+ When an event structure is filled in, the 
+\emph on 
+sequence
+\emph default 
+ member should be written after all other members have been updated.
+ Moreover, a memory barrier should be inserted between the updating of other
+ members and the updating of the 
+\emph on 
+sequence
+\emph default 
+ member.
+\layout Subsection
+
+PtlEQAlloc
+\begin_inset LatexCommand \label{sec:eqalloc}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQAlloc( ptl_handle_ni_t  interface,
+\newline 
+                ptl_size_t       count,
+\newline 
+                ptl_handle_eq_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to build an event queue.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ event queue.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface with which the event queue  will be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+count
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The number of events that can be stored in the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQFree
+\begin_inset LatexCommand \label{sec:eqfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQFree( ptl_handle_eq_t eventq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQFree
+\emph default 
+ function releases the resources associated with an event queue.
+ It is up to the user to ensure that no memory descriptors are associated
+ with the event queue once it is freed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the event queue to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQGet
+\begin_inset LatexCommand \label{sec:eqget}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQGet( ptl_handle_eq_t eventq,
+\newline 
+              ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQGet
+\emph default 
+ function is a nonblocking function that can be used to get the next event
+ in an event queue.
+ The event is removed from the queue.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_EQ_EMPTY Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is empty or another thread is waiting on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQWait
+\begin_inset LatexCommand \label{sec:eqwait}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQWait( ptl_handle_eq_t eventq,
+\newline 
+               ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block the calling process (thread) until there
+ is an event in an event queue.
+ This function also returns the next event in the event queue and removes
+ this event from the queue.
+ This is the only blocking operation in the Portals 3.2 API.
+ In the event that multiple threads are waiting on the same event queue,
+ PtlEQWait is guaranteed to wake exactly one thread, but the order in which
+ they are awakened is not specified.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+\noindent 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue to wait on.
+  The calling process (thread) will be blocked until 
+\family typewriter 
+eventq
+\family default 
+ is not empty.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+The Access Control Table
+\begin_inset LatexCommand \label{sec:ac}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes can use the access control table to control which processes are
+ allowed to perform operations on Portal table entries.
+ Each communication interface has a Portal table and an access control table.
+ The access control table for the default interface contains an entry at
+ index zero that allows all processes with the same user id to communicate.
+ Entries in the access control table can be manipulated using the 
+\emph on 
+PtlACEntry
+\emph default 
+ function.
+\layout Subsection
+
+PtlACEntry
+\begin_inset LatexCommand \label{sec:acentry}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlACEntry( ptl_handle_ni_t  interface,
+\newline 
+                ptl_ac_index_t   index,
+\newline 
+                ptl_process_id_t matchid,
+\newline 
+                ptl_uid_t        user_id,
+\newline 
+                ptl_pt_index_t   portal );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlACEntry
+\emph default 
+ function can be used to update an entry in the access control table for
+ an interface.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_AC_INV_INDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid access control table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_PT_INV_INDEX Indicates that 
+\family typewriter 
+portal
+\family default 
+ is not a valid Portal table index.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index of the entry in the access control table to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the process(es) that are allowed to  perform operations.
+ The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+user_id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the user that is allowed to  perform operations.
+ The value 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ can be used to wildcard the user.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the Portal index(es) that can be used.
+  The value 
+\family typewriter 
+PTL_PT_INDEX_ANY
+\family default 
+ can be used to wildcard the  Portal index.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Data Movement Operations
+\begin_inset LatexCommand \label{sec:datamovement}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API provides two data movement operations: 
+\emph on 
+PtlPut
+\emph default 
+ and 
+\emph on 
+PtlGet
+\emph default 
+.
+\layout Subsection
+
+PtlPut
+\begin_inset LatexCommand \label{sec:put}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t;
+\newline 
+
+\newline 
+int PtlPut( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_ack_req_t    ack_req,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset,
+\newline 
+            ptl_hdr_data_t   hdr_data );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ack_req_t
+\family default 
+ are used to control whether an acknowledgement should be sent when the
+ operation completes (i.e., when the data has been written to a memory descriptor
+ of the 
+\family typewriter 
+target
+\family default 
+ process).
+ The value 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+ requests an acknowledgement; the value 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+ requests that no acknowledgement should be generated.
+\layout Standard
+
+The 
+\emph on 
+PtlPut
+\emph default 
+ function initiates an asynchronous put operation.
+ There are several events associated with a put operation: initiation of
+ the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+), completion of the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_END
+\family default 
+ or 
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\family default 
+), and, when the send completes successfully, the receipt of an acknowledgement
+ (
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+) indicating that the operation was accepted by the target.
+ These events will be logged in the event queue associated with the memory
+ descriptor (
+\family typewriter 
+mem_desc
+\family default 
+) used in the put operation.
+ Using a memory descriptor that does not have an associated event queue
+ results in these events being discarded.
+ In this case, the application must have another mechanism (e.g., a higher
+ level protocol) for determining when it is safe to modify the memory region
+ associated with the memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="8" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory to be sent.
+  If the memory descriptor has an event queue  associated with it, it will
+ be used to record events when the  message has been sent (PTL_EVENT_SEND_START,
+ PTL_EVENT_SEND_END).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ack_req
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Controls whether an acknowledgement event is requested.
+  Acknowledgements are only sent when they are requested by the initiating
+ process 
+\series bold 
+and
+\series default 
+ the memory descriptor has an event queue 
+\series bold 
+and
+\series default 
+ the target memory descriptor enables them.
+ Allowed constants: 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+, 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+hdr_data
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+64 bits of user data that can be included in the message header.
+  This data is written to an event queue entry at the target if an event
+ queue is present on the matching memory descriptor.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlGet
+\begin_inset LatexCommand \label{sec:get}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGet( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlGet
+\emph default 
+ function initiates a remote read operation.
+ There are two event pairs associated with a get operation: when the data
+ is sent from the remote node, a 
+\family typewriter 
+PTL_EVENT_GET_{START|END}
+\family default 
+ event pair is registered on the remote node; and when the data is returned
+ from the remote node a 
+\family typewriter 
+PTL_EVENT_REPLY_{START|END}
+\family default 
+ event pair is registered on the local node.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="6" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory into which
+ the requested data will be received.
+  The memory descriptor can have an event queue associated with it to record
+ events, such as when the message receive has started (
+\family typewriter 
+PTL_EVENT_REPLY_START
+\family default 
+).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Summary
+\layout Standard
+
+
+\begin_inset LatexCommand \label{sec:summary}
+
+\end_inset 
+
+ We conclude this section by summarizing the names introduced by the Portals
+ 3.2 API.
+ We start by summarizing the names of the types introduced by the API.
+ This is followed by a summary of the functions introduced by the API.
+ This is followed by a summary of the function return codes.
+ Finally, we conclude with a summary of the other constant values introduced
+ by the API.
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+ presents a summary of the types defined by the Portals API.
+ The first column in this table gives the type name, the second column gives
+ a brief description of the type, the third column identifies the section
+ where the type is defined, and the fourth column lists the functions that
+ have arguments of this type.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Types Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:types}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\noindent 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="25" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2in">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.2in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Sect
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Functions 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for an access control table 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlACEntry, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+acknowledgement request types 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlPut
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+kinds of events
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+information about events 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_seq_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+event sequence number
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_any_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for any object 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for event queues 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert,
+ PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_me_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for match entries 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_ni_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut,
+ PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+node identifiers
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+process identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetUid, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+insertion position (before or after) 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+identifiers for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+match (and ignore) bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mb-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ni_fail_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+network interface-specific failures
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+process identifiers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for Portal tables 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+sizes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:size-t}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_value_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+values in status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+unlink options 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+ presents a summary of the functions defined by the Portals API.
+ The first column in this table gives the name for the function, the second
+ column gives a brief description of the operation implemented by the function,
+ and the third column identifies the section where the function is defined.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Functions Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:func}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="24" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlACEntry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update an entry in an access control table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQAlloc 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the next event from an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQFree 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ release the resources for an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQWait 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ wait for a new event in an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a get operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGetId 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the id for the current process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a memory descriptor and attach it to a match entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDBind 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a free-floating memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a memory descriptor from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUpdate 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a Portal table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a free Portal table entry
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:attachany}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEInsert 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a match entry and insert it in a list 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a match entry from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIDist 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the distance to another process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIHandle 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the network interface handle for an object 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIStatus 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ read a network interface status register 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlPut 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a put operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+ summarizes the return codes used by functions defined by the Portals API.
+ All of these constants are integer values.
+ The first column of this table gives the symbolic name for the constant,
+ the second column gives a brief description of the value, and the third
+ column identifies the functions that can return this value.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Function Return Codes for the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:retcodes}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="27" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.6in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Functions
+\series default 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_AC_INV_INDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_DROPPED
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+at least one event has been dropped 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_EMPTY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no events available in an event queue 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_FAIL 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+error during initialization or cleanup 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlInit, PtlFini 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ILL_MD
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+illegal memory descriptor values 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDBind, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_DUP 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+duplicate initialization of an interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_INV
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initialization of an invalid interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+the ME already has an MD
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ASIZE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table size 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_EQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUpdate, PtlEQFree, PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_HANDLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid memory descriptor handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUnlink, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ME
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid match entry handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid network interface handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PROC 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid process identifier 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PTINDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid Portal table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_REG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_SR_INDX 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ML_TOOLONG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match list too long 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+MD has pending operations
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMDUnlink
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOINIT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+uninitialized API 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\emph default 
+, except PtlInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOSPACE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insufficient memory 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ no update was performed 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_FULL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Portal table is full
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_OK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ success 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SEGV 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+addressing violation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate,
+ PtlEQAlloc, PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ summarizes the remaining constant values introduced by the Portals API.
+ The first column in this table presents the symbolic name for the constant,
+ the second column gives a brief description of the value, the third column
+ identifies the type for the value, the fourth column identifies the section
+ in which the value is introduced, and the fifth column identifies the
+ sections in which the value is mentioned.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Other Constants Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:oconsts}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="36" columns="5">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Base type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Intr.
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Ref.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request an acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_NONE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a NULL event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_UNLINK
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+unlink event
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PID_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for process id fields 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for node id fields
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for user id
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_IFACE_DEFAULT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+default interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_AFTER 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert after 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_BEFORE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert before 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_ACK_DISABLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to disable acknowledgements 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_MANAGE_REMOTE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable the use of remote offsets 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_GET 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable get operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_PUT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable put operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_THRESH_INF 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+infinite threshold for a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_TRUNCATE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable truncation of a request 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOACK_REQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request no acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_INDEX_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for Portal indexes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_RETAIN 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+disable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SR_DROP_COUNT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+index for the dropped count register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UNLINK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+enable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Chapter
+
+The Semantics of Message Transmission
+\begin_inset LatexCommand \label{sec:semantics}
+
+\end_inset 
+
+
+\layout Standard
+
+The portals API uses four types of messages: put requests, acknowledgements,
+ get requests, and replies.
+ In this chapter, we describe the information passed on the wire for each
+ type of message.
+ We also describe how this information is used to process incoming messages.
+\layout Section
+
+Sending Messages
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:put-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a put request.
+ The first column provides a descriptive name for the information, the second
+ column provides the type for this information, the third column identifies
+ the source of the information, and the fourth column provides additional
+ notes.
+ Most information that is transmitted is obtained directly from the 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ Notice that the handle for the memory descriptor used in the 
+\emph on 
+PtlPut
+\emph default 
+ operation is transmitted even though this value cannot be interpreted by
+ the target.
+ A value of anything other than 
+\family typewriter 
+PTL_MD_NONE
+\family default 
+ is interpreted as a request for an acknowledgement.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Put Request
+\begin_inset LatexCommand \label{tab:put-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="12" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlPut
+\emph default 
+ arg
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a put request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no ack if 
+\family typewriter 
+PTL_MD_NONE
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family roman 
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+start
+\family default 
+ and 
+\family typewriter 
+length
+\family default 
+ members 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:ack-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in an acknowledgement.
+ Most of the information is simply echoed from the put request.
+ Notice that the initiator and target are obtained directly from the put
+ request, but are swapped in generating the acknowledgement.
+ The only new piece of information in the acknowledgement is the manipulated
+ length, which is determined as the put request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in an Acknowledgement
+\begin_inset LatexCommand \label{tab:ack-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="10" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Put Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ indicates an acknowledgement 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:get-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a get request.
+ Like the information transmitted in a put request, most of the information
+ transmitted in a get request is obtained directly from the 
+\emph on 
+PtlGet
+\emph default 
+ operation.
+ Unlike put requests, get requests do not include the event queue handle.
+ In this case, the reply is generated whenever the operation succeeds and
+ the memory descriptor must not be unlinked until the reply is received.
+ As such, there is no advantage to explicitly sending the event queue handle.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Get Request
+\begin_inset LatexCommand \label{tab:get-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlGet
+\emph default 
+ argument
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a get request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:reply-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in a reply.
+ Like an acknowledgement, most of the information is simply echoed from
+ the get request.
+ The initiator and target are obtained directly from the get request, but
+ are swapped in generating the reply.
+ The only new pieces of information in the reply are the manipulated length
+ and the data, which are determined as the get request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Reply
+\begin_inset LatexCommand \label{tab:reply-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Get Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a reply 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Section
+
+Receiving Messages
+\begin_inset LatexCommand \label{sec:receiving}
+
+\end_inset 
+
+
+\layout Standard
+
+When an incoming message arrives on a network interface, the communication
+ system first checks that the target process identified in the request is
+ a valid process that has initialized the network interface (i.e., that the
+ target process has a valid Portal table).
+ If this test fails, the communication system discards the message and increment
+s the dropped message count for the interface.
+ The remainder of the processing depends on the type of the incoming message.
+ Put and get messages are subject to access control checks and translation
+ (searching a match list), while acknowledgement and reply messages bypass
+ the access control checks and the translation step.
+\layout Standard
+
+Acknowledgement messages include a handle for the memory descriptor used
+ in the original 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ This memory descriptor will identify the event queue where the event should
+ be recorded.
+ Upon receipt of an acknowledgement, the runtime system only needs to confirm
+ that the memory descriptor and event queue still exist and that there is
+ space for another event.
+ Should any of these conditions fail, the message is simply discarded
+ and the dropped message count for the interface is incremented.
+ Otherwise, the system builds an acknowledgement event from the information
+ in the acknowledgement message and adds it to the event queue.
+\layout Standard
+
+Reception of reply messages is also relatively straightforward.
+ Each reply message includes a handle for a memory descriptor.
+ If this descriptor exists, it is used to receive the message.
+ A reply message will be dropped if the memory descriptor identified in
+ the request doesn't exist.
+ In this case, the dropped message count for the interface is
+ incremented.
+ This is the only reason for dropping reply messages.
+ Every memory descriptor accepts and truncates incoming reply messages,
+ eliminating the other potential reasons for rejecting a reply message.
+\layout Standard
+
+The critical step in processing an incoming put or get request involves
+ mapping the request to a memory descriptor.
+ This step starts by using the Portal index in the incoming request to identify
+ a list of match entries.
+ This list of match entries is searched in order until a match entry is
+ found whose match criteria matches the match bits in the incoming request
+ and whose memory descriptor accepts the request.
+\layout Standard
+
+Because acknowledgement and reply messages are generated in response to requests
+ made by the process receiving these messages, the checks performed by the
+ runtime system for acknowledgements and replies are minimal.
+ In contrast, put and get messages are generated by remote processes and
+ the checks performed for these messages are more extensive.
+ Incoming put or get messages may be rejected because: 
+\layout Itemize
+
+the Portal index supplied in the request is not valid; 
+\layout Itemize
+
+the cookie supplied in the request is not a valid access control entry;
+\layout Itemize
+
+the access control entry identified by the cookie does not match the identifier
+ of the requesting process; 
+\layout Itemize
+
+the access control entry identified by the cookie does not
+ match the Portal index supplied in the request; or 
+\layout Itemize
+
+the match bits supplied in the request do not match any of the match entries
+ with a memory descriptor that accepts the request.
+\layout Standard
+
+In all cases, if the message is rejected, the incoming message is discarded
+ and the dropped message count for the interface is incremented.
+\layout Standard
+
+A memory descriptor may reject an incoming request for any of the following
+ reasons: 
+\layout Itemize
+
+the 
+\family typewriter 
+PTL_MD_PUT
+\family default 
+ or 
+\family typewriter 
+PTL_MD_GET
+\family default 
+ option has not been enabled and the operation is put or get, respectively;
+\layout Itemize
+
+the length specified in the request is too long for the memory descriptor
+ and the 
+\family typewriter 
+PTL_MD_TRUNCATE
+\family default 
+ option has not been enabled.
+\layout Chapter
+
+Examples
+\begin_inset LatexCommand \label{sec:examples}
+
+\end_inset 
+
+
+\layout Comment
+
+The examples presented in this chapter have not been updated to reflect
+ the current API.
+\layout Standard
+
+In this section we present several examples to illustrate expected usage
+ patterns for the Portals 3.2 API.
+ The first example describes how to implement parallel servers using the
+ features of the Portals 3.2 API.
+ This example covers the access control list and the use of remote managed
+ offsets.
+ The second example presents an approach to dealing with dropped requests.
+ This example covers aspects of match lists and memory descriptors.
+ The final example covers message reception in MPI.
+ This example illustrates more sophisticated uses of matching and a procedure
+ to update a memory descriptor.
+\layout Section
+
+Parallel File Servers
+\begin_inset LatexCommand \label{sec:expfs}
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:file}
+
+\end_inset 
+
+ illustrates the logical structure of a parallel file server.
+ In this case, the parallel server consists of four servers that stripe
+ application data across four disks.
+ We would like to present applications with the illusion that the file server
+ is a single entity.
+ We will assume that all of the processes that constitute the parallel server
+ have the same user id.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename file.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 196pt
+       lyxheight 147pt
+\end_inset 
+
+
+\layout Caption
+
+Parallel File Server
+\begin_inset LatexCommand \label{fig:file}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When an application establishes a connection to the parallel file server,
+ it will allocate a Portal and access control list entry for communicating
+ with the server.
+ The access control list entry will include the Portal and match any process
+ in the parallel file server, so all of the file server processes will
+ have access to the portal.
+ The Portal information and access control entry will be sent to the file
+ server at this time.
+ If the application and server need to have multiple, concurrent I/O operations,
+ they can use additional portals or match entries to keep the operations
+ from interfering with one another.
+\layout Standard
+
+When an application initiates an I/O operation, it first builds a memory
+ descriptor that describes the memory region involved in the operation.
+ This memory descriptor will enable the appropriate operation (put for read
+ operations and get for write operations) and enable the use of remote offsets
+ (this lets the servers decide where their data should be placed in the
+ memory region).
+ After creating the memory descriptor and linking it into the appropriate
+ Portal entry, the application sends a read or write request (using 
+\emph on 
+PtlPut
+\emph default 
+) to one of the file server processes.
+ The file server processes can then use put or get operations with the appropria
+te offsets to fill or retrieve the contents of the application's buffer.
+ To know when the operation has completed, the application can add an event
+ queue to the memory descriptor and add up the lengths of the remote operations
+ until the sum is the size of the requested I/O operation.
+\layout Section
+
+Dealing with Dropped Requests
+\begin_inset LatexCommand \label{sec:exdrop}
+
+\end_inset 
+
+
+\layout Standard
+
+If a process does not anticipate unexpected requests, they will be discarded.
+ Applications using the Portals API can query the dropped count for the
+ interface to determine the number of requests that have been dropped (see
+ Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ While this approach minimizes resource consumption, it does not provide
+ information that might be critical in debugging the implementation of a
+ higher level protocol.
+\layout Standard
+
+To keep track of more information about dropped requests, we use a memory
+ descriptor that truncates each incoming request to zero bytes and logs
+ the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ operations in an event queue.
+ Note that the operations are not dropped in the Portals sense, because
+ the operation succeeds.
+\layout Standard
+
+The following code fragment illustrates an implementation of this approach.
+ In this case, we assume that a thread is launched to execute the function
+\family typewriter 
+watch_drop
+\family default 
+.
+ This code starts by building an event queue to log truncated operations
+ and a memory descriptor to truncate the incoming requests.
+ This example only captures 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests for a single portal.
+ In a more realistic situation, the memory descriptor would be appended
+ to the match list for every portal.
+ We also assume that the thread is capable of keeping up with the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests.
+ If this is not the case, we could use a finite threshold on the memory
+ descriptor to capture the first few dropped requests.
+\layout LyX-Code
+
+
+\size small 
+#include <stdio.h>
+\newline 
+#include <stdlib.h>
+\newline 
+#include <portals.h>
+\newline 
+
+\newline 
+#define DROP_SIZE 32       /* number of dropped requests to track */
+\newline 
+
+\newline 
+int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) {
+\newline 
+    ptl_handle_eq_t drop_events;
+\newline 
+    ptl_event_t event;
+\newline 
+    ptl_handle_md_t drop_em;
+\newline 
+    ptl_md_t drop_desc;
+\newline 
+    ptl_process_id_t any_proc;
+\newline 
+    ptl_handle_me_t match_any;
+\newline 
+
+\newline 
+    /* create the event queue */
+\newline 
+    if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the event queue
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* build a match entry */
+\newline 
+    any_proc.nid = PTL_ID_ANY;
+\newline 
+    any_proc.pid = PTL_ID_ANY;
+\newline 
+    PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN,
+\newline 
+                        &match_any );
+\newline 
+
+\newline 
+    /* create the memory descriptor */
+\newline 
+    drop_desc.start = NULL;
+\newline 
+    drop_desc.length = 0;
+\newline 
+    drop_desc.threshold = PTL_MD_THRESH_INF;
+\newline 
+    drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE;
+\newline 
+    drop_desc.user_ptr = NULL;
+\newline 
+    drop_desc.eventq = drop_events;
+\newline 
+    if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the memory descriptor
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* watch for "dropped" requests */
+\newline 
+    while( 1 ) {
+\newline 
+        if( PtlEQWait( drop_events, &event ) != PTL_OK ) break;
+\newline 
+        fprintf( stderr, "Dropped request from gid = %d, rid = %d
+\backslash 
+n", event.initiator.gid,
+ event.initiator.rid );
+\newline 
+    }
+\newline 
+}
+\layout Section
+
+Message Transmission in MPI
+\begin_inset LatexCommand \label{sec:exmpi}
+
+\end_inset 
+
+
+\layout Standard
+
+We conclude this section with a fairly extensive example that describes
+ an approach to implementing message transmission for MPI.
+ Like many MPI implementations, we distinguish two message transmission
+ protocols: a short message protocol and a long message protocol.
+ We use the constant 
+\family typewriter 
+MPI_LONG_LENGTH
+\family default 
+ to determine the size of a long message.
+\layout Standard
+
+For small messages, the sender simply sends the message and presumes that
+ the message will be received (i.e., the receiver has allocated a memory region
+ to receive the message body).
+ For large messages, the sender also sends the message, but does not presume
+ that the message body will be saved.
+ Instead, the sender builds a memory descriptor for the message and enables
+ get operations on this descriptor.
+ If the target does not save the body of the message, it will record an
+ event for the put operation.
+ When the process later issues a matching MPI receive, it will perform a
+ get operation to retrieve the body of the message.
+\layout Standard
+
+To facilitate receive side matching based on the protocol, we use the most
+ significant bit in the match bits to indicate the protocol: 1 for long
+ messages and 0 for short messages.
+\layout Standard
+
+The following code presents a function that implements the send side of
+ the protocol.
+ The global variable 
+\family typewriter 
+EndGet
+\family default 
+ is the last match entry attached to the Portal index used for posting long
+ messages.
+ This entry does not match any incoming requests (i.e., the memory descriptor
+ rejects all get operations) and is built during initialization of the MPI
+ library.
+ The other global variable, 
+\family typewriter 
+MPI_NI
+\family default 
+, is a handle for the network interface used by the MPI implementation.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_me_t EndGet;
+\newline 
+extern ptl_handle_ni_t MPI_NI;
+\newline 
+
+\newline 
+void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq,
+\newline 
+                    ptl_process_id_t target, ptl_match_bits_t match ) 
+\newline 
+{
+\newline 
+    ptl_handle_md_t send_handle;
+\newline 
+    ptl_md_t mem_desc;
+\newline 
+    ptl_ack_req_t want_ack;
+\newline 
+
+\newline 
+    mem_desc.start = buf;
+\newline 
+    mem_desc.length = len;
+\newline 
+    mem_desc.threshold = 1;
+\newline 
+    mem_desc.options = PTL_MD_GET_OP;
+\newline 
+    mem_desc.user_ptr = data;
+\newline 
+    mem_desc.eventq = eventq;
+\newline 
+
+\newline 
+    if( len >= MPI_LONG_LENGTH ) {
+\newline 
+        ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+        /* add a match entry to the end of the get list */
+\newline 
+        PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet,
+ &me_handle );
+\newline 
+        PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL );
+\newline 
+
+\newline 
+        /* we want an ack for long messages */
+\newline 
+        want_ack = PTL_ACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a long message
+ */
+\newline 
+        match |= 1<<63;
+\newline 
+    } else {
+\newline 
+        /* we don't want an ack for short messages */
+\newline 
+        want_ack = PTL_NOACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a short message
+ */
+\newline 
+        match &= ~(1<<63);
+\newline 
+    }
+\newline 
+
+\newline 
+   /* create a memory descriptor and send it */
+\newline 
+   PtlMDBind( MPI_NI, mem_desc, &send_handle );
+\newline 
+   PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match,
+ 0 );
+\newline 
+}
+\layout Standard
+
+The 
+\emph on 
+MPIsend
+\emph default 
+ function returns as soon as the message has been scheduled for transmission.
+ The event queue argument, 
+\family typewriter 
+eventq
+\family default 
+, can be used to determine the disposition of the message.
+ Assuming that 
+\family typewriter 
+eventq
+\family default 
+ is not 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, a 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+ event will be recorded for each message as the message is transmitted.
+ For small messages, this is the only event that will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+ In contrast, long messages include an explicit request for an acknowledgement.
+ If the 
+\family typewriter 
+target
+\family default 
+ process has posted a matching receive, the acknowledgement will be sent
+ as the message is received.
+ If a matching receive has not been posted, the message will be discarded
+ and no acknowledgement will be sent.
+ When the 
+\family typewriter 
+target
+\family default 
+ process later issues a matching receive, the receive will be translated
+ into a get operation and a 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+ event will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:mpi}
+
+\end_inset 
+
+ illustrates the organization of the match list used for receiving MPI messages.
+ The initial entries (not shown in this figure) would be used to match the
+ MPI receives that have been preposted by the application.
+ The preposted receives are followed by a match entry, 
+\emph on 
+RcvMark
+\emph default 
+, that marks the boundary between preposted receives and the memory descriptors
+ used for 
+\begin_inset Quotes eld
+\end_inset 
+
+unexpected
+\begin_inset Quotes erd
+\end_inset 
+
+ messages.
+ The 
+\emph on 
+RcvMark
+\emph default 
+ entry is followed by a small collection of match entries that match unexpected
+\begin_inset Quotes eld
+\end_inset 
+
+short
+\begin_inset Quotes erd
+\end_inset 
+
+ messages, i.e., messages that have a 0 in the most significant bit of their
+ match bits.
+ The memory descriptors associated with these match entries will append
+ the incoming message to the associated memory descriptor and record an
+ event in an event queue for unexpected messages.
+ The unexpected short message matching entries are followed by a match entry
+ that will match messages that were not matched by the preceding match entries,
+ i.e., the unexpected long messages.
+ The memory descriptor associated with this match entry truncates the message
+ body and records an event in the event queue for unexpected messages.
+ Note that all of the memory descriptors used for unexpected messages share
+ a common event queue.
+ This makes it possible to process the unexpected messages in the order
+ in which they arrived, regardless of whether they are short or long messages.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename mpi.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 389pt
+       lyxheight 284pt
+\end_inset 
+
+
+\layout Caption
+
+Message Reception in MPI
+\begin_inset LatexCommand \label{fig:mpi}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When the local MPI process posts an MPI receive, we must first search the
+ events in the unexpected message queue to see if a matching message has already
+ arrived.
+ If no matching message is found, a match entry for the receive is inserted
+ before the 
+\emph on 
+RcvMark
+\emph default 
+ entry--after the match entries for all of the previously posted receives
+ and before the match entries for the unexpected messages.
+ This ensures that preposted receives are matched in the order that they
+ were posted (a requirement of MPI).
+\layout Standard
+
+While this strategy respects the temporal semantics of MPI, it introduces
+ a race condition: a matching message might arrive after the events in the
+ unexpected message queue have been searched, but before the match entry
+ for the receive has been inserted in the match list.
+\layout Standard
+
+To avoid this race condition we start by setting the 
+\family typewriter 
+threshold
+\family default 
+ of the memory descriptor to 0, making the descriptor inactive.
+ We then insert the match entry into the match list and proceed to search
+ the events in the unexpected message queue.
+ A matching message that arrives as we are searching the unexpected message
+ queue will not be accepted by the memory descriptor and, if not matched
+ by an earlier match list element, will add an event to the unexpected message
+ queue.
+ After searching the events in the unexpected message queue, we update the
+ memory descriptor, setting the threshold to 1 to activate the memory descriptor.
+ This update is predicated on the condition that the unexpected message
+ queue is empty.
+ We repeat the process of searching the unexpected message queue until the
+ update succeeds.
+\layout Standard
+
+The following code fragment illustrates this approach.
+ Because events must be removed from the unexpected message queue to be
+ examined, this code fragment assumes the existence of a user managed event
+ list, 
+\family typewriter 
+Rcvd
+\family default 
+, for the events that have already been removed from the unexpected message
+ queue.
+ In an effort to keep the example focused on the basic protocol, we have
+ omitted the code that would be needed to manage the memory descriptors
+ used for unexpected short messages.
+ In particular, we simply leave messages in these descriptors until they
+ are received by the application.
+ In a robust implementation, we would introduce code to ensure that short
+ unexpected messages are removed from these memory descriptors so that they
+ can be re-used.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_eq_t UnexpQueue;
+\newline 
+extern ptl_handle_me_t RcvMark;
+\newline 
+extern ptl_handle_me_t ShortMatch;
+\newline 
+
+\newline 
+typedef struct event_list_tag {
+\newline 
+    ptl_event_t            event;
+\newline 
+    struct event_list_tag* next;
+\newline 
+} event_list;
+\newline 
+
+\newline 
+extern event_list Rcvd;
+\newline 
+
+\newline 
+void AppendRcvd( ptl_event_t event )
+\newline 
+{
+\newline 
+    /* append an event onto the Rcvd list */
+\newline 
+}
+\newline 
+
+\newline 
+int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi
+ts_t match,
+\newline 
+                       ptl_match_bits_t ignore, ptl_event_t *event )
+\newline 
+{
+\newline 
+    /* Search the Rcvd event queue, looking for a message that matches the
+ requested message.
+\newline 
+     * If one is found, remove the event from the Rcvd list and return it.
+ */
+\newline 
+}
+\newline 
+
+\newline 
+typedef enum { RECEIVED, POSTED } receive_state;
+\newline 
+
+\newline 
+receive_state CopyMsg( void *buf, ptl_size_t &length, ptl_event_t event,
+ ptl_md_t md_buf )
+\newline 
+{
+\newline 
+    ptl_md_t md_buf;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+    if( event.rlength >= MPI_LONG_LENGTH ) {
+\newline 
+        PtlMDBind( MPI_NI, md_buf, &md_handle );
+\newline 
+        PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX,
+ md_handle );
+\newline 
+        return POSTED;
+\newline 
+    } else {
+\newline 
+        /* copy the message */
+\newline 
+        if( event.mlength < *length ) *length = event.mlength;
+\newline 
+        memcpy( buf, (char*)event.md_desc.start+event.offset, *length );
+\newline 
+        return RECEIVED;
+\newline 
+    }
+\newline 
+}
+\newline 
+
+\newline 
+receive_state MPIreceive( void *buf, ptl_size_t &len, void *MPI_data, ptl_handle
+_eq_t eventq, 
+\newline 
+                           ptl_process_id_t sender, ptl_match_bits_t match,
+ ptl_match_bits_t ignore )
+\newline 
+{
+\newline 
+    ptl_md_t md_buf;
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+    ptl_event_t event;
+\newline 
+
+\newline 
+    /* build a memory descriptor for the receive */
+\newline 
+    md_buf.start = buf;
+\newline 
+    md_buf.length = *len;
+\newline 
+    md_buf.threshold = 0;     /* temporarily disabled */
+\newline 
+    md_buf.options = PTL_MD_PUT_OP;
+\newline 
+    md_buf.user_ptr = MPI_data;
+\newline 
+    md_buf.eventq = eventq;
+\newline 
+
+\newline 
+    /* see if we have already received the message */
+\newline 
+    if( SearchRcvd(buf, len, sender, match, ignore, &event) )
+\newline 
+         return CopyMsg( buf, len, event, md_buf );
+\newline 
+
+\newline 
+    /* create the match entry and attach the  memory descriptor */
+\newline 
+    PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark,
+ &me_handle);
+\newline 
+    PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle );
+\newline 
+
+\newline 
+    md_buf.threshold = 1;
+\newline 
+    do
+\newline 
+        if( PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) {
+\newline 
+            if( MPIMatch(event, match, ignore, sender) ) {
+\newline 
+                return CopyMsg( buf, len, (char*)event.md_desc.start+event.offset,
+ md_buf );
+\newline 
+            } else {
+\newline 
+                AppendRcvd( event );
+\newline 
+            }
+\newline 
+        }
+\newline 
+    while( PtlMDUpdate(md_handle, NULL, &md_buf, UnexpQueue) == PTL_NOUPDATE
+ );
+\newline 
+    return POSTED;
+\newline 
+}
+\layout Chapter*
+
+Acknowledgments
+\layout Standard
+
+Several people have contributed to the philosophy, design, and implementation
+ of the Portals message passing architecture as it has evolved.
+ We acknowledge the following people for their contributions: Al Audette,
+ Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike
+ Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke,
+ Dave van Dresser, Lee Ward, and Stephen Wheat.
+\layout Standard
+
+
+\begin_inset LatexCommand \BibTeX[ieee]{portals3}
+
+\end_inset 
+
+
+\the_end
diff --git a/lnet/doc/put.fig b/lnet/doc/put.fig
new file mode 100644 (file)
index 0000000..5235b6d
--- /dev/null
@@ -0,0 +1,32 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1350 900 2175 1200
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1275 2700 1725
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 1200
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2699 1788 899 1938
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001
+4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001
+4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am
new file mode 100644 (file)
index 0000000..2cf7f99
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = portals linux
+EXTRA_DIST = config.h.in
+include $(top_srcdir)/Rules
diff --git a/lnet/include/config.h.in b/lnet/include/config.h.in
new file mode 100644 (file)
index 0000000..b05d0c4
--- /dev/null
@@ -0,0 +1,11 @@
+/* ../include/config.h.in.  Generated automatically from configure.in by autoheader.  */
+
+/* Define if you have the readline library (-lreadline).  */
+#undef HAVE_LIBREADLINE
+
+/* Name of package */
+#undef PACKAGE
+
+/* Version number of package */
+#undef VERSION
+
diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..6a65cb5
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(top_srcdir)/Rules
+
+linuxincludedir = $(includedir)/linux
+
+linuxinclude_HEADERS=kp30.h portals_lib.h
diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h
new file mode 100644 (file)
index 0000000..4915fe3
--- /dev/null
@@ -0,0 +1,936 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _KP30_INCLUDED
+#define _KP30_INCLUDED
+
+
+#define PORTAL_DEBUG
+
+#ifndef offsetof
+# define offsetof(typ,memb)    ((int)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define LOWEST_BIT_SET(x)      ((x) & ~((x) - 1))
+
+#ifndef CONFIG_SMP
+# define smp_processor_id() 0
+#endif
+
+/*
+ *  Debugging
+ */
+extern unsigned int portal_subsystem_debug;
+extern unsigned int portal_stack;
+extern unsigned int portal_debug;
+extern unsigned int portal_printk;
+/* Debugging subsystems  (8 bit ID)
+ *
+ * If you add debug subsystem #32, you need to send email to phil, because
+ * you're going to break kernel subsystem debug filtering. */
+#define S_UNDEFINED    (0 << 24)
+#define S_MDC          (1 << 24)
+#define S_MDS          (2 << 24)
+#define S_OSC          (3 << 24)
+#define S_OST          (4 << 24)
+#define S_CLASS        (5 << 24)
+#define S_OBDFS        (6 << 24) /* obsolete */
+#define S_LLITE        (7 << 24)
+#define S_RPC          (8 << 24)
+#define S_EXT2OBD      (9 << 24) /* obsolete */
+#define S_PORTALS     (10 << 24)
+#define S_SOCKNAL     (11 << 24)
+#define S_QSWNAL      (12 << 24)
+#define S_PINGER      (13 << 24)
+#define S_FILTER      (14 << 24)
+#define S_TRACE       (15 << 24) /* obsolete */
+#define S_ECHO        (16 << 24)
+#define S_LDLM        (17 << 24)
+#define S_LOV         (18 << 24)
+#define S_GMNAL       (19 << 24)
+#define S_PTLROUTER   (20 << 24)
+#define S_COBD        (21 << 24)
+#define S_PTLBD       (22 << 24)
+#define S_LOG         (23 << 24)
+
+/* If you change these values, please keep portals/linux/utils/debug.c
+ * up to date! */
+
+/* Debugging masks (24 bits, non-overlapping) */
+#define D_TRACE     (1 << 0) /* ENTRY/EXIT markers */
+#define D_INODE     (1 << 1)
+#define D_SUPER     (1 << 2)
+#define D_EXT2      (1 << 3) /* anything from ext2_debug */
+#define D_MALLOC    (1 << 4) /* print malloc, free information */
+#define D_CACHE     (1 << 5) /* cache-related items */
+#define D_INFO      (1 << 6) /* general information */
+#define D_IOCTL     (1 << 7) /* ioctl related information */
+#define D_BLOCKS    (1 << 8) /* ext2 block allocation */
+#define D_NET       (1 << 9) /* network communications */
+#define D_WARNING   (1 << 10)
+#define D_BUFFS     (1 << 11)
+#define D_OTHER     (1 << 12)
+#define D_DENTRY    (1 << 13)
+#define D_PORTALS   (1 << 14) /* ENTRY/EXIT markers */
+#define D_PAGE      (1 << 15) /* bulk page handling */
+#define D_DLMTRACE  (1 << 16)
+#define D_ERROR     (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG     (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA        (1 << 19) /* recovery and failover */
+#define D_RPCTRACE  (1 << 20) /* for distributed debugging */
+#define D_VFSTRACE  (1 << 21)
+
+#ifndef THREAD_SIZE
+#define THREAD_SIZE 8192 /* fallback when no earlier header defined one */
+#endif
+#ifdef  __arch_ia64__ /* low bits of the address of 'var' itself */
+#define CDEBUG_STACK(var) ((unsigned long)&(var) & (THREAD_SIZE - 1))
+#else /* THREAD_SIZE minus the low bits of the current frame address */
+#define CDEBUG_STACK(var) (THREAD_SIZE -                                      \
+                           ((unsigned long)__builtin_frame_address(0)&        \
+                            (THREAD_SIZE - 1)))
+#endif
+
+#ifdef __KERNEL__
+#define CHECK_STACK(stack)                                                    \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack)      \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR,           \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          (stack),                            \
+                                          "maximum lustre stack %u\n",        \
+                                          portal_stack = (stack));            \
+        } while (0)
+#else
+#define CHECK_STACK(stack) do{}while(0)
+#endif
+
+#define CDEBUG(mask, format, a...)                                            \
+do {                                                                          \
+        unsigned long stack = CDEBUG_STACK(stack);                            \
+        int match = 0;                                                        \
+                                                                              \
+        CHECK_STACK(stack);                                                   \
+        if (!(mask))                                                          \
+                match = 1;                                                    \
+        else if ((mask) & (D_ERROR | D_EMERG))                                \
+                match = 1;                                                    \
+        else if (portal_debug & (mask) &&                                     \
+                 portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24)))     \
+                match = 1;                                                    \
+        if (match)                                                            \
+                portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  stack, format , ## a);                      \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
+
+#define GOTO(label, rc)                                                 \
+do {                                                                    \
+        long GOTO__ret = (long)(rc);                                    \
+        CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \
+               #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\
+               (signed long)GOTO__ret);                                 \
+        goto label;                                                     \
+} while (0)
+
+#define RETURN(rc)                                                      \
+do {                                                                    \
+        typeof(rc) RETURN__ret = (rc);                                  \
+        long tmp = (long)RETURN__ret;                                   \
+        CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n",       \
+               (unsigned long)tmp, (signed long)tmp,                    \
+               (signed long)tmp);                                       \
+        return RETURN__ret;                                             \
+} while (0)
+
+#define ENTRY                                                           \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process entered\n");                           \
+} while (0)
+
+#define EXIT                                                            \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process leaving\n");                           \
+} while(0)
+
+
+#ifdef __KERNEL__
+# include <linux/vmalloc.h>
+# include <linux/time.h>
+# include <linux/slab.h>
+# include <linux/interrupt.h>
+# include <linux/highmem.h>
+# include <linux/module.h>
+# include <linux/version.h>
+# include <portals/lib-nal.h>
+# include <linux/smp_lock.h>
+# include <asm/atomic.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define schedule_work schedule_task
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_TQUEUE((wq), 0, 0);                                              \
+        PREPARE_TQUEUE((wq), (cb), (cbdata));                                 \
+} while (0)
+
+#define ll_invalidate_inode_pages invalidate_inode_pages
+#define PageUptodate Page_Uptodate
+#define our_recalc_sigpending(current) recalc_sigpending(current)
+#define num_online_cpus() smp_num_cpus
+static inline void our_cond_resched(void)
+{
+        if (current->need_resched)
+               schedule ();
+}
+
+#else
+
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_WORK((wq), (void *)(cb), (void *)(cbdata));                      \
+} while (0)
+#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping)
+#define wait_on_page wait_on_page_locked
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+static inline void our_cond_resched(void)
+{
+        cond_resched();
+}
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+
+#ifdef PORTAL_DEBUG
+extern void kportal_assertion_failed(char *expr,char *file,char *func,int line);
+#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__,  \
+                                                        __FUNCTION__, __LINE__))
+#else
+#define LASSERT(e)
+#endif
+
+#ifdef __arch_um__
+#define LBUG()                                                          \
+do {                                                                    \
+        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__);      \
+        panic("LBUG");                                                  \
+} while (0)
+#else
+#define LBUG()                                                          \
+do {                                                                    \
+        CEMERG("LBUG\n");                                               \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__);      \
+        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
+        schedule();                                                     \
+} while (0)
+#endif /* __arch_um__ */
+
+/*
+ * Memory
+ */
+#ifdef PORTAL_DEBUG
+extern atomic_t portal_kmemory;
+
+# define portal_kmem_inc(ptr, size)                                           \
+do {                                                                          \
+        atomic_add(size, &portal_kmemory);                                    \
+} while (0)
+
+# define portal_kmem_dec(ptr, size) do {                                      \
+        atomic_sub(size, &portal_kmemory);                                    \
+} while (0)
+
+#else
+# define portal_kmem_inc(ptr, size) do {} while (0)
+# define portal_kmem_dec(ptr, size) do {} while (0)
+#endif /* PORTAL_DEBUG */
+
+#define PORTAL_VMALLOC_SIZE        16384
+
+#define PORTAL_ALLOC(ptr, size)                                           \
+do {                                                                      \
+        long s = size;                                                    \
+        LASSERT (!in_interrupt());                                        \
+        if (s > PORTAL_VMALLOC_SIZE)                                      \
+                (ptr) = vmalloc(s);                                       \
+        else                                                              \
+                (ptr) = kmalloc(s, GFP_KERNEL);                           \
+        if ((ptr) == NULL)                                                \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s);    \
+        else {                                                            \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_FREE(ptr, size)                                          \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        if (s > PORTAL_VMALLOC_SIZE)                                    \
+                vfree(ptr);                                             \
+        else                                                            \
+                kfree(ptr);                                             \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+#define PORTAL_SLAB_ALLOC(ptr, slab, size)                                \
+do {                                                                      \
+        long s = (size);                                                  \
+        LASSERT (!in_interrupt());                                        \
+        (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL);                    \
+        if ((ptr) == NULL) {                                              \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' from slab '" #slab "')\n", __FILE__,  \
+                       __LINE__);                                         \
+        } else {                                                          \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_SLAB_FREE(ptr, slab, size)                               \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        memset((ptr), 0x5a, s);                                         \
+        kmem_cache_free((slab), ptr);                                   \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+/* ------------------------------------------------------------------- */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x)
+
+#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x))
+#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
+
+#define PORTAL_MODULE_USE       MOD_INC_USE_COUNT
+#define PORTAL_MODULE_UNUSE     MOD_DEC_USE_COUNT
+#else
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+#define PORTAL_SYMBOL_GET(x) symbol_get(x)
+#define PORTAL_SYMBOL_PUT(x) symbol_put(x)
+
+#define PORTAL_MODULE_USE       try_module_get(THIS_MODULE)
+#define PORTAL_MODULE_UNUSE     module_put(THIS_MODULE)
+
+#endif
+
+/******************************************************************************/
+/* Kernel Portals Router interface */
+
+typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback
+
+/* space for routing targets to stash "stuff" in a forwarded packet */
+typedef union {
+        long long        _alignment;
+        void            *_space[16];            /* scale with CPU arch */
+} kprfd_scratch_t;
+
+/* Kernel Portals Routing Forwarded message Descriptor */
+typedef struct {
+        struct list_head     kprfd_list;        /* stash in queues (routing target can use) */
+        ptl_nid_t            kprfd_target_nid;  /* final destination NID */
+        ptl_nid_t            kprfd_gateway_nid; /* gateway NID */
+        int                  kprfd_nob;         /* # message bytes (including header) */
+        int                  kprfd_niov;        /* # message frags (including header) */
+        struct iovec        *kprfd_iov;         /* message fragments */
+        void                *kprfd_router_arg;  // originating NAL's router arg
+        kpr_fwd_callback_t   kprfd_callback;    /* completion callback */
+        void                *kprfd_callback_arg; /* completion callback arg */
+        kprfd_scratch_t      kprfd_scratch;    // scratchpad for routing targets
+} kpr_fwd_desc_t;
+
+typedef void  (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+
+/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
+typedef const struct {
+        int             kprni_nalid;    /* NAL's id */
+        void           *kprni_arg;      /* Arg to pass when calling into NAL */
+        kpr_fwd_t       kprni_fwd;      /* NAL's forwarding entrypoint */
+} kpr_nal_interface_t;
+
+/* Router's routing interface (Kernel Portals Routing Router Interface) */
+typedef const struct {
+        /* register the calling NAL with the router and get back the handle for
+         * subsequent calls */
+        int     (*kprri_register) (kpr_nal_interface_t *nal_interface,
+                                   void **router_arg);
+
+        /* ask the router to find a gateway that forwards to 'nid' and is a peer
+         * of the calling NAL */
+        int     (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+                                 ptl_nid_t *gateway_nid);
+
+        /* hand a packet over to the router for forwarding */
+        kpr_fwd_t kprri_fwd_start;
+
+        /* hand a packet back to the router for completion */
+        void    (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
+                                   int error);
+
+        /* the calling NAL is shutting down */
+        void    (*kprri_shutdown) (void *router_arg);
+
+        /* deregister the calling NAL with the router */
+        void    (*kprri_deregister) (void *router_arg);
+
+} kpr_router_interface_t;
+
+/* Convenient struct for NAL to stash router interface/args */
+typedef struct {
+        kpr_router_interface_t  *kpr_interface;
+        void                    *kpr_arg;
+} kpr_router_t;
+
+/* Router's control interface (Kernel Portals Routing Control Interface) */
+typedef const struct {
+        int     (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
+                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+        int     (*kprci_del_route)(ptl_nid_t nid);
+        int     (*kprci_get_route)(int index, int *gateway_nal,
+                                   ptl_nid_t *gateway, ptl_nid_t *lo_nid,
+                                   ptl_nid_t *hi_nid);
+} kpr_control_interface_t;
+
+extern kpr_control_interface_t  kpr_control_interface;
+extern kpr_router_interface_t   kpr_router_interface;
+
+static inline int
+kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif)
+{
+        int    rc;
+
+        router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface);
+        if (router->kpr_interface == NULL)
+                return (-ENOENT);
+
+        rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg);
+        if (rc != 0)
+                router->kpr_interface = NULL;
+
+        PORTAL_SYMBOL_PUT (kpr_router_interface);
+        return (rc);
+}
+
+static inline int
+kpr_routing (kpr_router_t *router)
+{
+        return (router->kpr_interface != NULL);
+}
+
+static inline int
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+{
+        if (!kpr_routing (router))
+                return (-EHOSTUNREACH);
+
+        return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid,
+                                                    gateway_nid));
+}
+
+static inline void
+kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, 
+              int nob, int niov, struct iovec *iov, 
+              kpr_fwd_callback_t callback, void *callback_arg)
+{
+        fwd->kprfd_target_nid   = nid;
+        fwd->kprfd_gateway_nid  = nid;
+        fwd->kprfd_nob          = nob;
+        fwd->kprfd_niov         = niov;
+        fwd->kprfd_iov          = iov;
+        fwd->kprfd_callback     = callback;
+        fwd->kprfd_callback_arg = callback_arg;
+}
+
+static inline void
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
+{
+        if (!kpr_routing (router))
+                fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+        else
+                router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
+}
+
+static inline void
+kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error)
+{
+        LASSERT (kpr_routing (router));
+        router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error);
+}
+
+static inline void
+kpr_shutdown (kpr_router_t *router)
+{
+        if (kpr_routing (router))
+                router->kpr_interface->kprri_shutdown (router->kpr_arg);
+}
+
+static inline void
+kpr_deregister (kpr_router_t *router)
+{
+        if (!kpr_routing (router))
+                return;
+        router->kpr_interface->kprri_deregister (router->kpr_arg);
+        router->kpr_interface = NULL;
+}
+
+/******************************************************************************/
+
+#ifdef PORTALS_PROFILING
+#define prof_enum(FOO) PROF__##FOO
+enum {
+        prof_enum(our_recvmsg),
+        prof_enum(our_sendmsg),
+        prof_enum(socknal_recv),
+        prof_enum(lib_parse),
+        prof_enum(conn_list_walk),
+        prof_enum(memcpy),
+        prof_enum(lib_finalize),
+        prof_enum(pingcli_time),
+        prof_enum(gmnal_send),
+        prof_enum(gmnal_recv),
+        MAX_PROFS
+};
+
+struct prof_ent {
+        char *str;
+        /* hrmph.  wrap-tastic. */
+        u32       starts;
+        u32       finishes;
+        cycles_t  total_cycles;
+        cycles_t  start;
+        cycles_t  end;
+};
+
+extern struct prof_ent prof_ents[MAX_PROFS];
+
+#define PROF_START(FOO)                                         \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->starts++;                                   \
+                pe->start = get_cycles();                       \
+        } while (0)
+
+#define PROF_FINISH(FOO)                                        \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->finishes++;                                 \
+                pe->end = get_cycles();                         \
+                pe->total_cycles += (pe->end - pe->start);      \
+        } while (0)
+#else /* !PORTALS_PROFILING */
+#define PROF_START(FOO) do {} while(0)
+#define PROF_FINISH(FOO) do {} while(0)
+#endif /* PORTALS_PROFILING */
+
+/* debug.c */
+void portals_run_lbug_upcall(char * file, char *fn, int line);
+void portals_debug_dumplog(void);
+int portals_debug_init(unsigned long bufsize);
+int portals_debug_cleanup(void);
+int portals_debug_clear_buffer(void);
+int portals_debug_mark_buffer(char *text);
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                char *file, unsigned int size);
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len);
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                        unsigned long stack, const char *format, ...)
+        __attribute__ ((format (printf, 7, 8)));
+#else
+void portals_debug_msg (int subsys, int mask, char *file, char *fn,
+                        int line, unsigned long stack,
+                        const char *format, ...);
+#endif /* __GNUC__ */
+void portals_debug_set_level(unsigned int debug_level);
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+extern void kportal_daemonize (char *name);
+extern void kportal_blockallsigs (void);
+
+#else  /* !__KERNEL__ */
+# include <stdio.h>
+# include <stdlib.h>
+#ifndef __CYGWIN__
+# include <stdint.h>
+#endif
+# include <unistd.h>
+# include <time.h>
+# include <asm/types.h>
+# ifndef DEBUG_SUBSYSTEM
+#  define DEBUG_SUBSYSTEM S_UNDEFINED
+# endif
+# ifdef PORTAL_DEBUG
+#  undef NDEBUG
+#  include <assert.h>
+#  define LASSERT(e)   assert(e)
+# else
+#  define LASSERT(e)
+# endif
+# define printk(format, args...) printf (format, ## args)
+/* zero-filled like the kernel PORTAL_ALLOC; each macro is a single statement */
+# define PORTAL_ALLOC(ptr, size) do { (ptr) = calloc(1, (size)); } while (0)
+# define PORTAL_FREE(a, b) do { free(a); } while (0)
+# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
+    printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, (subsys) >> 24,    \
+        (mask), (unsigned long)time(0), file, fn, line, getpid(), stack, ## a)
+#endif
+
+#ifndef CURRENT_TIME
+# define CURRENT_TIME time(0)
+#endif
+
+#include <linux/portals_lib.h>
+
+/*
+ * USER LEVEL STUFF BELOW
+ */
+
+#define PORTAL_IOCTL_VERSION 0x00010007
+#define PING_SYNC       0
+#define PING_ASYNC      1
+
+struct portal_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+        __u64 ioc_nid;
+        __u64 ioc_nid2;
+        __u64 ioc_nid3;
+        __u32 ioc_count;
+        __u32 ioc_nal;
+        __u32 ioc_nal_cmd;
+        __u32 ioc_fd;
+        __u32 ioc_id;
+
+        __u32 ioc_flags;
+        __u32 ioc_size;
+
+        __u32 ioc_wait;
+        __u32 ioc_timeout;
+        __u32 ioc_misc;
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
+
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+struct portal_ioctl_hdr {
+        __u32 ioc_len;
+        __u32 ioc_version;
+};
+
+struct portals_debug_ioctl_data
+{
+        struct portal_ioctl_hdr hdr;
+        unsigned int subs;
+        unsigned int debug;
+};
+
+#define PORTAL_IOC_INIT(data)  /* zero 'data', then set version and length */ \
+do {                                                    \
+        memset(&(data), 0, sizeof(data));               \
+        (data).ioc_version = PORTAL_IOCTL_VERSION;      \
+        (data).ioc_len = sizeof(data);                  \
+} while (0)
+
+/* FIXME check conflict with lustre_lib.h */
+#define PTL_IOC_DEBUG_MASK             _IOWR('f', 250, long)
+
+static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
+{
+        int len = sizeof(*data);
+        len += size_round(data->ioc_inllen1);
+        len += size_round(data->ioc_inllen2);
+        return len;
+}
+
+static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data)
+{
+        if (data->ioc_len > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+                CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+                CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf1 && !data->ioc_plen1) {
+                CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf2 && !data->ioc_plen2) {
+                CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_plen1 && !data->ioc_pbuf1) {
+                CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+                return 1;
+        }
+        if (data->ioc_plen2 && !data->ioc_pbuf2) {
+                CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+                return 1;
+        }
+        if (portal_ioctl_packlen(data) != data->ioc_len ) {
+                CERROR ("PORTALS ioctl: packlen != ioc_len\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 &&
+            data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 &&
+            data->ioc_bulk[size_round(data->ioc_inllen1) +
+                           data->ioc_inllen2 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n");
+                return 1;
+        }
+        return 0;
+}
+
+#ifndef __KERNEL__
+static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
+                                    int max)
+{
+        char *ptr;
+        struct portal_ioctl_data *overlay;
+        data->ioc_len = portal_ioctl_packlen(data);
+        data->ioc_version = PORTAL_IOCTL_VERSION;
+
+        if (*pbuf && portal_ioctl_packlen(data) > max)
+                return 1;
+        if (*pbuf == NULL) {
+                *pbuf = malloc(data->ioc_len);
+        }
+        if (!*pbuf)
+                return 1;
+        overlay = (struct portal_ioctl_data *)*pbuf;
+        memcpy(*pbuf, data, sizeof(*data));
+
+        ptr = overlay->ioc_bulk;
+        if (data->ioc_inlbuf1)
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        if (data->ioc_inlbuf2)
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        if (portal_ioctl_is_invalid(overlay))
+                return 1;
+
+        return 0;
+}
+#else
+#include <asm/uaccess.h>
+
+/* Copy the ioctl request at user address 'arg' into [buf, end) and validate
+ * it.  Returns 0, -EFAULT or -EINVAL.  buf MUST hold a portal_ioctl_hdr. */
+static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct portal_ioctl_hdr *hdr = (struct portal_ioctl_hdr *)buf;
+        struct portal_ioctl_data *data = (struct portal_ioctl_data *)buf;
+        __u32 len;
+        ENTRY;
+
+        /* copy_from_user() returns the number of bytes it could NOT copy */
+        if (copy_from_user(buf, (void *)arg, sizeof(*hdr))) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+                CERROR ("PORTALS: version mismatch kernel vs application\n");
+                return -EINVAL;
+        }
+
+        /* compare lengths, not pointers: buf + ioc_len could wrap around */
+        len = hdr->ioc_len;
+        if (end <= buf || len >= (unsigned long)(end - buf)) {
+                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
+                return -EINVAL;
+        }
+
+        if (len < sizeof(struct portal_ioctl_data)) {
+                CERROR ("PORTALS: user buffer too small for ioctl\n");
+                return -EINVAL;
+        }
+
+        if (copy_from_user(buf, (void *)arg, len)) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        /* the header was fetched again: keep the length validated above */
+        data->ioc_len = len;
+        if (portal_ioctl_is_invalid(data)) {
+                CERROR ("PORTALS: ioctl not correctly formatted\n");
+                return -EINVAL;
+        }
+
+        if (data->ioc_inllen1) {
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+        }
+
+        if (data->ioc_inllen2) {
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+        }
+
+        EXIT;
+        return 0;
+}
+#endif
+
+/* portals ioctls: type IOC_PORTAL_TYPE, command numbers 30- */
+#define IOC_PORTAL_TYPE                   'e'
+#define IOC_PORTAL_MIN_NR                 30
+
+#define IOC_PORTAL_PING                    _IOWR('e', 30, long)
+#define IOC_PORTAL_GET_DEBUG               _IOWR('e', 31, long)
+#define IOC_PORTAL_CLEAR_DEBUG             _IOWR('e', 32, long)
+#define IOC_PORTAL_MARK_DEBUG              _IOWR('e', 33, long)
+#define IOC_PORTAL_PANIC                   _IOWR('e', 34, long)
+#define IOC_PORTAL_ADD_ROUTE               _IOWR('e', 35, long)
+#define IOC_PORTAL_DEL_ROUTE               _IOWR('e', 36, long)
+#define IOC_PORTAL_GET_ROUTE               _IOWR('e', 37, long)
+#define IOC_PORTAL_NAL_CMD                _IOWR('e', 38, long)
+#define IOC_PORTAL_GET_NID                 _IOWR('e', 39, long)
+#define IOC_PORTAL_FAIL_NID                _IOWR('e', 40, long)
+#define IOC_PORTAL_SET_DAEMON              _IOWR('e', 41, long)
+
+#define IOC_PORTAL_MAX_NR               41
+
+enum {
+        QSWNAL  =  1,
+        SOCKNAL,
+        GMNAL,
+        TOENAL,
+        TCPNAL,
+        SCIMACNAL,
+        NAL_ENUM_END_MARKER
+};
+
+#ifdef __KERNEL__
+extern ptl_handle_ni_t  kqswnal_ni;
+extern ptl_handle_ni_t  ksocknal_ni;
+extern ptl_handle_ni_t  ktoenal_ni;
+extern ptl_handle_ni_t  kgmnal_ni;
+extern ptl_handle_ni_t  kscimacnal_ni;
+#endif
+
+#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
+
+#define NAL_CMD_REGISTER_PEER_FD     100
+#define NAL_CMD_CLOSE_CONNECTION     101
+#define NAL_CMD_REGISTER_MYNID       102
+#define NAL_CMD_PUSH_CONNECTION      103
+
+enum {
+        DEBUG_DAEMON_START       =  1,
+        DEBUG_DAEMON_STOP        =  2,
+        DEBUG_DAEMON_PAUSE       =  3,
+        DEBUG_DAEMON_CONTINUE    =  4,
+};
+
+/* XXX remove to lustre ASAP */
+struct lustre_peer {
+        ptl_nid_t       peer_nid;
+        ptl_handle_ni_t peer_ni;
+};
+
+/* module.c */
+typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private);
+int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
+int kportal_nal_unregister(int nal);
+
+ptl_handle_ni_t *kportal_get_ni (int nal);
+void kportal_put_ni (int nal);
+
+#ifdef __CYGWIN__
+#ifndef BITS_PER_LONG
+#if (~0UL) == 0xffffffffUL
+#define BITS_PER_LONG 32
+#else 
+#define BITS_PER_LONG 64
+#endif
+#endif
+#endif
+
+#if (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%Lu"
+# define LPD64 "%Ld"
+# define LPX64 "%#Lx"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#if (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%lu"
+# define LPD64 "%ld"
+# define LPX64 "%#lx"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
+
+#endif
diff --git a/lnet/include/linux/portals_lib.h b/lnet/include/linux/portals_lib.h
new file mode 100644 (file)
index 0000000..a528a80
--- /dev/null
@@ -0,0 +1,188 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _PORTALS_LIB_H
+#define _PORTALS_LIB_H
+
+#ifndef __KERNEL__
+# include <string.h>
+#else 
+# include <asm/types.h>
+#endif
+
+#undef MIN
+#define MIN(a,b) (((a)<(b)) ? (a): (b))
+#undef MAX
+#define MAX(a,b) (((a)>(b)) ? (a): (b))
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")  /* ptr itself, or "" when ptr is NULL */
+
+static inline int size_round (int val)
+{
+        return (val + 7) & ~7;          /* round up to a multiple of 8 */
+}
+
+static inline int size_round0(int val)
+{
+        /* zero stays zero; anything else gets one extra byte and is
+         * then rounded up to a multiple of 8 */
+        return val == 0 ? 0 : (val + 8) & ~7;
+}
+
+static inline size_t round_strlen(const char *fset)
+{
+        return size_round(strlen(fset) + 1);    /* NUL counted, then rounded */
+}
+
+#ifdef __KERNEL__
+/* kernel build only: duplicate str into a kmalloc(GFP_KERNEL) buffer */
+static inline char *strdup(const char *str)
+{
+        int nob = strlen(str) + 1;      /* bytes to copy, NUL included */
+        char *copy = kmalloc(nob, GFP_KERNEL);
+        if (copy)
+                memcpy(copy, str, nob);
+        return copy;                    /* 0 if the allocation failed */
+}
+#endif
+
+#ifdef __KERNEL__
+# define NTOH__u32(var) le32_to_cpu(var)
+# define NTOH__u64(var) le64_to_cpu(var)
+# define HTON__u32(var) cpu_to_le32(var)
+# define HTON__u64(var) cpu_to_le64(var)
+#else /* user space: values are passed through without byte swapping */
+# define expansion_u64(var) \
+    ({  __u64 ret = 0;  /* stays 0 if sizeof(var) is not 1, 2, 4 or 8 */ \
+       switch (sizeof(var)) {   \
+       case 8: (ret) = (var); break; \
+       case 4: (ret) = (__u32)(var); break; \
+       case 2: (ret) = (__u16)(var); break; \
+       case 1: (ret) = (__u8)(var); break; \
+       };       \
+       (ret);     \
+    })
+# define NTOH__u32(var) (var)
+# define NTOH__u64(var) (expansion_u64(var))
+# define HTON__u32(var) (var)
+# define HTON__u64(var) (expansion_u64(var))
+#endif
+
+/*
+ * copy sizeof(type) bytes from ptr to var and move ptr forward; make the
+ * calling function return -EFAULT, before reading, if that would pass end
+ */
+#define UNLOGV(var,type,ptr,end)                \
+do {                                            \
+        if ((ptr) + sizeof(type) > (end))       \
+                return -EFAULT;                 \
+        (var) = *(type *)(ptr);                 \
+        (ptr) += sizeof(type);                  \
+} while (0)
+
+/* as UNLOGV, but the value read is passed through NTOH<type>() */
+/* type MUST be __u32 or __u64 */
+#define LUNLOGV(var,type,ptr,end)               \
+do {                                            \
+        if ((ptr) + sizeof(type) > (end))       \
+                return -EFAULT;                 \
+        (var) = NTOH##type(*(type *)(ptr));     \
+        (ptr) += sizeof(type);                  \
+} while (0)
+
+/* store var at ptr as a `type' and step ptr past it */
+#define LOGV(var,type,ptr)                      \
+do {                                            \
+        *(type *)(ptr) = (var);                 \
+        (ptr) += sizeof(type);                  \
+} while (0)
+
+/* the same, with var passed through HTON<type>() first */
+#define LLOGV(var,type,ptr)                     \
+do {                                            \
+        *(type *)(ptr) = HTON##type(var);       \
+        (ptr) += sizeof(type);                  \
+} while (0)
+
+
+/*
+ * point var at the `type' object stored at ptr and step ptr over it;
+ * make the calling function return -EFAULT if ptr ends up beyond end
+ */
+#define UNLOGP(var,type,ptr,end)                \
+do {                                            \
+        (var) = (type *)(ptr);                  \
+        (ptr) += sizeof(type);                  \
+        if ((ptr) > (end))                      \
+                return -EFAULT;                 \
+} while (0)
+
+#define LOGP(var,type,ptr)                      \
+do {    /* copy the `type' that var points at to ptr */ \
+        memcpy((ptr), (var), sizeof(type));     \
+        (ptr) += sizeof(type);                  \
+} while (0)
+
+/*
+ * set var to (type *)ptr and move ptr forward by size_round() of
+ * len * sizeof(type) bytes; return -EFAULT from the caller if past end
+ */
+#define UNLOGL(var,type,len,ptr,end)            \
+do {                                            \
+        (var) = (type *)(ptr);                  \
+        (ptr) += size_round((len) * sizeof(type)); \
+        if ((ptr) > (end))                      \
+                return -EFAULT;                 \
+} while (0)
+
+#define UNLOGL0(var,type,len,ptr,end)                                   \
+do {    /* UNLOGL, then require a NUL in the last unpacked byte */      \
+        UNLOGL(var,type,len,ptr,end);                                   \
+        if ((len) > 0 && ((char *)(var))[(len) * sizeof(type) - 1] != '\0') \
+                return -EFAULT; /* len == 0: nothing to check */        \
+} while (0)
+
+#define LOGL(var,len,ptr)                                       \
+do {    /* copy len bytes from var (if set) to ptr; advance ptr */ \
+        if (var)                                                \
+                memcpy((char *)(ptr), (const char *)(var), (len)); \
+        (ptr) += size_round(len);                               \
+} while (0)
+
+#define LOGU(var,len,ptr)                                       \
+do {    /* copy len bytes from ptr to var (if set); advance ptr */ \
+        if (var)                                                \
+                memcpy((char *)(var), (const char *)(ptr), (len)); \
+        (ptr) += size_round(len);                               \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {    /* copy len bytes and append a NUL; no-op when len is 0 */ \
+        if (!(len))                                     \
+                break;                                  \
+        memcpy((char *)(ptr), (const char *)(var), (len)); \
+        *((char *)(ptr) + (len)) = 0;                   \
+        (ptr) += size_round((len) + 1);                 \
+} while (0)
+
+#endif /* _PORTALS_LIB_H */
diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..c61b084
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = base
+include $(top_srcdir)/Rules
+
+pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h
+
diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h
new file mode 100644 (file)
index 0000000..af4a2dc
--- /dev/null
@@ -0,0 +1,27 @@
+# define DEBUG_SUBSYSTEM S_PORTALS
+# define PORTAL_DEBUG
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+
+#include <portals/internal.h>
+#include <portals/nal.h>
+#include <portals/arg-blocks.h>
+
+/* Hack for 2.4.18 macro name collision */
+#ifdef yield
+#undef yield
+#endif
diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h
new file mode 100644 (file)
index 0000000..a83749b
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef P30_API_H
+#define P30_API_H
+
+#include <portals/types.h>
+
+#ifndef PTL_NO_WRAP
+int PtlInit(void);
+int PtlInitialized(void);
+void PtlFini(void);
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in,
+              ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * interface_out);
+
+int PtlNIInitialized(ptl_interface_t);
+
+int PtlNIFini(ptl_handle_ni_t interface_in);
+
+#endif
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
+
+
+/*
+ * Network interfaces
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlNIBarrier(ptl_handle_ni_t interface_in);
+#endif
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out);
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out);
+
+#ifndef PTL_NO_WRAP
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
+#endif
+
+
+/*
+ * PtlNIDebug: 
+ *
+ * This is not an official Portals 3 API call.  It is provided
+ * by the reference implementation to allow the maintainers an
+ * easy way to turn on and off debugging information in the
+ * library.  Do not use it in code that is not intended for use
+ * with any version other than the portable reference library.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
+
+/* 
+ * PtlFailNid
+ *
+ * Not an official Portals 3 API call.  It provides a way of simulating
+ * communications failures to all (nid == PTL_NID_ANY), or specific peers
+ * (via multiple calls), either until further notice (threshold == -1), or
+ * for a specific number of messages.  Passing a threshold of zero, "heals"
+ * the given peer.
+ */
+int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
+
+
+/*
+ * Match entries
+ */
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out);
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out);
+
+int PtlMEUnlink(ptl_handle_me_t current_in);
+
+int PtlMEUnlinkList(ptl_handle_me_t current_in);
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in);
+int PtlMEDump(ptl_handle_me_t current_in);
+
+
+
+/*
+ * Memory descriptors
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_handle_md_t * handle_out);
+
+int PtlMDUnlink(ptl_handle_md_t md_in);
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
+
+#endif
+
+/* These should not be called by users */
+int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                         ptl_md_t * new_inout, ptl_handle_eq_t testq_in,
+                         ptl_seq_t sequence_in);
+
+
+
+
+/*
+ * Event queues
+ */
+#ifndef PTL_NO_WRAP
+
+/* These should be called by users */
+int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out);
+int PtlEQFree(ptl_handle_eq_t eventq_in);
+
+int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out);
+
+int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout);
+#endif
+
+/*
+ * Access Control Table
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in);
+
+
+/*
+ * Data movement
+ */
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in);
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in);
+
+
+
+#endif
diff --git a/lnet/include/lnet/arg-blocks.h b/lnet/include/lnet/arg-blocks.h
new file mode 100644 (file)
index 0000000..3c3b154
--- /dev/null
@@ -0,0 +1,265 @@
+#ifndef PTL_BLOCKS_H
+#define PTL_BLOCKS_H
+
+/*
+ * blocks.h
+ *
+ * Argument block types for the Portals 3.0 library
+ * Generated by idl
+ *
+ */
+
+#include <portals/types.h>
+
+/* put LIB_MAX_DISPATCH last here  -- these must match the
+   assignments to the dispatch table in lib-p30/dispatch.c */
+#define PTL_GETID     1
+#define PTL_NISTATUS  2
+#define PTL_NIDIST    3
+#define PTL_NIDEBUG   4
+#define PTL_MEATTACH  5
+#define PTL_MEINSERT  6
+// #define PTL_MEPREPEND 7
+#define PTL_MEUNLINK  8
+#define PTL_TBLDUMP   9 
+#define PTL_MEDUMP   10
+#define PTL_MDATTACH 11
+// #define PTL_MDINSERT 12
+#define PTL_MDBIND   13
+#define PTL_MDUPDATE 14
+#define PTL_MDUNLINK 15
+#define PTL_EQALLOC  16
+#define PTL_EQFREE   17
+#define PTL_ACENTRY  18
+#define PTL_PUT      19 
+#define PTL_GET      20
+#define PTL_FAILNID  21
+#define LIB_MAX_DISPATCH 21
+
+typedef struct PtlFailNid_in {
+       ptl_handle_ni_t interface;
+       ptl_nid_t       nid;
+       unsigned int    threshold;
+} PtlFailNid_in;
+
+typedef struct PtlFailNid_out {
+       int             rc;
+} PtlFailNid_out;
+
+typedef struct PtlGetId_in {
+        ptl_handle_ni_t handle_in;
+} PtlGetId_in;
+
+typedef struct PtlGetId_out {
+        int rc;
+        ptl_process_id_t id_out;
+} PtlGetId_out;
+
+typedef struct PtlNIStatus_in {
+        ptl_handle_ni_t interface_in;
+        ptl_sr_index_t register_in;
+} PtlNIStatus_in;
+
+typedef struct PtlNIStatus_out {
+        int rc;
+        ptl_sr_value_t status_out;
+} PtlNIStatus_out;
+
+
+typedef struct PtlNIDist_in {
+        ptl_handle_ni_t interface_in;
+        ptl_process_id_t process_in;
+} PtlNIDist_in;
+
+typedef struct PtlNIDist_out {
+        int rc;
+        unsigned long distance_out;
+} PtlNIDist_out;
+
+
+typedef struct PtlNIDebug_in {
+        unsigned int mask_in;
+} PtlNIDebug_in;
+
+typedef struct PtlNIDebug_out {
+        unsigned int rc;
+} PtlNIDebug_out;
+
+
+typedef struct PtlMEAttach_in {
+        ptl_handle_ni_t interface_in;
+        ptl_pt_index_t index_in;
+        ptl_ins_pos_t position_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+} PtlMEAttach_in;
+
+typedef struct PtlMEAttach_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEAttach_out;
+
+
+typedef struct PtlMEInsert_in {
+        ptl_handle_me_t current_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+        ptl_ins_pos_t position_in;
+} PtlMEInsert_in;
+
+typedef struct PtlMEInsert_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEInsert_out;
+
+typedef struct PtlMEUnlink_in {
+        ptl_handle_me_t current_in;
+        ptl_unlink_t unlink_in;
+} PtlMEUnlink_in;
+
+typedef struct PtlMEUnlink_out {
+        int rc;
+} PtlMEUnlink_out;
+
+
+typedef struct PtlTblDump_in {
+        int index_in;
+} PtlTblDump_in;
+
+typedef struct PtlTblDump_out {
+        int rc;
+} PtlTblDump_out;
+
+
+typedef struct PtlMEDump_in {
+        ptl_handle_me_t current_in;
+} PtlMEDump_in;
+
+typedef struct PtlMEDump_out {
+        int rc;
+} PtlMEDump_out;
+
+
+typedef struct PtlMDAttach_in {
+        ptl_handle_me_t me_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+        ptl_unlink_t unlink_in;
+} PtlMDAttach_in;
+
+typedef struct PtlMDAttach_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDAttach_out;
+
+
+typedef struct PtlMDBind_in {
+        ptl_handle_ni_t ni_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+} PtlMDBind_in;
+
+typedef struct PtlMDBind_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDBind_out;
+
+
+typedef struct PtlMDUpdate_internal_in {
+        ptl_handle_md_t md_in;
+        ptl_handle_eq_t testq_in;
+        ptl_seq_t sequence_in;
+
+        ptl_md_t old_inout;
+        int old_inout_valid;
+        ptl_md_t new_inout;
+        int new_inout_valid;
+} PtlMDUpdate_internal_in;
+
+typedef struct PtlMDUpdate_internal_out {
+        int rc;
+        ptl_md_t old_inout;
+        ptl_md_t new_inout;
+} PtlMDUpdate_internal_out;
+
+
+typedef struct PtlMDUnlink_in {
+        ptl_handle_md_t md_in;
+} PtlMDUnlink_in;
+
+typedef struct PtlMDUnlink_out {
+        int rc;
+        ptl_md_t status_out;
+} PtlMDUnlink_out;
+
+
+typedef struct PtlEQAlloc_in {
+        ptl_handle_ni_t ni_in;
+        ptl_size_t count_in;
+        void *base_in;
+        int len_in;
+        int (*callback_in) (ptl_event_t * event);
+} PtlEQAlloc_in;
+
+typedef struct PtlEQAlloc_out {
+        int rc;
+        ptl_handle_eq_t handle_out;
+} PtlEQAlloc_out;
+
+
+typedef struct PtlEQFree_in {
+        ptl_handle_eq_t eventq_in;
+} PtlEQFree_in;
+
+typedef struct PtlEQFree_out {
+        int rc;
+} PtlEQFree_out;
+
+
+typedef struct PtlACEntry_in {
+        ptl_handle_ni_t ni_in;
+        ptl_ac_index_t index_in;
+        ptl_process_id_t match_id_in;
+        ptl_pt_index_t portal_in;
+} PtlACEntry_in;
+
+typedef struct PtlACEntry_out {
+        int rc;
+} PtlACEntry_out;
+
+
+typedef struct PtlPut_in {
+        ptl_handle_md_t md_in;
+        ptl_ack_req_t ack_req_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+        ptl_hdr_data_t hdr_data_in;
+} PtlPut_in;
+
+typedef struct PtlPut_out {
+        int rc;
+} PtlPut_out;
+
+
+typedef struct PtlGet_in {
+        ptl_handle_md_t md_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+} PtlGet_in;
+
+typedef struct PtlGet_out {
+        int rc;
+} PtlGet_out;
+
+
+#endif
diff --git a/lnet/include/lnet/defines.h b/lnet/include/lnet/defines.h
new file mode 100644 (file)
index 0000000..285f7e0
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+** $Id: defines.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+**
+** This file contains definitions that are used throughout the cplant code.
+*/
+
+#ifndef CPLANT_H
+#define CPLANT_H
+
+#define TITLE(fname,zmig)
+
+
+/*
+** TRUE and FALSE
+*/
+#undef TRUE
+#define TRUE           (1)
+#undef FALSE
+#define FALSE          (0)
+
+
+/*
+** Return codes from functions
+*/
+#undef OK
+#define OK             (0)
+#undef ERROR
+#define ERROR          (-1)
+
+
+
+/*
+** Plain MAX()/MIN() macros; note that each argument is evaluated twice.
+*/
+#ifndef MAX
+#define MAX(a, b)      ((a) > (b) ? (a) : (b))
+#endif /* MAX */
+
+#ifndef MIN
+#define MIN(a, b)      ((a) < (b) ? (a) : (b))
+#endif /* MIN */
+
+/*
+** The rest is from the old qkdefs.h
+*/
+
+#ifndef __linux__
+#define __inline__
+#endif
+
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+#ifndef __osf__
+#define PRIVATE static
+#define PUBLIC
+#endif
+
+#ifndef __osf__
+typedef unsigned char           uchar;
+#endif
+
+typedef char                    CHAR;
+typedef unsigned char           UCHAR;
+typedef char                    INT8;
+typedef unsigned char           UINT8;
+typedef short int               INT16;
+typedef unsigned short int      UINT16;
+typedef int                     INT32;
+typedef unsigned int            UINT32;
+typedef long                    LONG32;
+typedef unsigned long           ULONG32;
+
+/* long may be 32 or 64, so we can't really append the size to the definition */
+typedef long                    LONG;
+typedef unsigned long           ULONG;
+
+#ifdef __alpha__
+typedef long int_t;
+#ifndef __osf__
+typedef unsigned long uint_t;
+#endif
+#endif
+
+#ifdef __i386__
+typedef int int_t;
+typedef unsigned int uint_t;
+#endif
+
+typedef float                   FLOAT32;
+typedef double                  FLOAT64;
+typedef void                    VOID;
+typedef INT32                   BOOLEAN;
+typedef void (*FCN_PTR)(void);
+
+#ifndef off64_t
+
+#if defined (__alpha__) || defined (__ia64__)
+typedef long                     off64_t;
+#else
+typedef long long                off64_t;
+#endif
+
+#endif
+
+/*
+** Process related typedefs
+*/
+typedef UINT16 PID_TYPE;  /* Type of Local process ID */
+typedef UINT16 NID_TYPE;  /* Type of Physical node ID */
+typedef UINT16 GID_TYPE;  /* Type of Group ID */
+typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */
+
+
+
+#endif /* CPLANT_H */
diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h
new file mode 100644 (file)
index 0000000..817936a
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _P30_ERRNO_H_
+#define _P30_ERRNO_H_
+
+/*
+ * include/portals/errno.h
+ *
+ * Shared error number lists
+ */
+
+/* If you change these, you must update the string table in api-errno.c */
+typedef enum {
+        PTL_OK              = 0,
+        PTL_SEGV            = 1,
+
+        PTL_NOSPACE         = 2,
+        PTL_INUSE           = 3,
+        PTL_VAL_FAILED      = 4,
+
+        PTL_NAL_FAILED      = 5,
+        PTL_NOINIT          = 6,
+        PTL_INIT_DUP        = 7,
+        PTL_INIT_INV        = 8,
+        PTL_AC_INV_INDEX    = 9,
+
+        PTL_INV_ASIZE       = 10,
+        PTL_INV_HANDLE      = 11,
+        PTL_INV_MD          = 12,
+        PTL_INV_ME          = 13,
+        PTL_INV_NI          = 14,
+/* If you change these, you must update the string table in api-errno.c */
+        PTL_ILL_MD          = 15,
+        PTL_INV_PROC        = 16,
+        PTL_INV_PSIZE       = 17,
+        PTL_INV_PTINDEX     = 18,
+        PTL_INV_REG         = 19,
+
+        PTL_INV_SR_INDX     = 20,
+        PTL_ML_TOOLONG      = 21,
+        PTL_ADDR_UNKNOWN    = 22,
+        PTL_INV_EQ          = 23,
+        PTL_EQ_DROPPED      = 24,
+
+        PTL_EQ_EMPTY        = 25,
+        PTL_NOUPDATE        = 26,
+        PTL_FAIL            = 27,
+        PTL_NOT_IMPLEMENTED = 28,
+        PTL_NO_ACK          = 29,
+
+        PTL_IOV_TOO_MANY    = 30,
+        PTL_IOV_TOO_SMALL   = 31,
+
+       PTL_EQ_INUSE        = 32,
+       PTL_MD_INUSE        = 33,
+
+        PTL_MAX_ERRNO       = 33
+} ptl_err_t;
+/* If you change these, you must update the string table in api-errno.c */
+
+extern const char *ptl_err_str[];
+
+#endif
diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h
new file mode 100644 (file)
index 0000000..8ade444
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+** $Id: internal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+#ifndef _P30_INTERNAL_H_
+#define _P30_INTERNAL_H_
+
+/*
+ * p30/internal.h
+ *
+ * Internals for the API level library that are not needed
+ * by the user application
+ */
+
+#include <portals/p30.h>
+
+extern int ptl_init;           /* Has the library been initialized */
+
+extern int ptl_ni_init(void);
+extern int ptl_me_init(void);
+extern int ptl_md_init(void);
+extern int ptl_eq_init(void);
+
+extern int ptl_me_ni_init(nal_t * nal);
+extern int ptl_md_ni_init(nal_t * nal);
+extern int ptl_eq_ni_init(nal_t * nal);
+
+extern void ptl_ni_fini(void);
+extern void ptl_me_fini(void);
+extern void ptl_md_fini(void);
+extern void ptl_eq_fini(void);
+
+extern void ptl_me_ni_fini(nal_t * nal);
+extern void ptl_md_ni_fini(nal_t * nal);
+extern void ptl_eq_ni_fini(nal_t * nal);
+
+static inline ptl_eq_t *
+ptl_handle2usereq (ptl_handle_eq_t *handle)
+{
+        /* EQ handles are a little weird.  On the "user" side, the cookie
+         * is just a pointer to a queue of events in shared memory.  Its
+         * cb_eq_handle is the "real" handle which we pass when we
+         * call do_forward(). */
+        return (ptl_eq_t *)((unsigned long)handle->cookie);
+}
+
+#endif
diff --git a/lnet/include/lnet/lib-dispatch.h b/lnet/include/lnet/lib-dispatch.h
new file mode 100644 (file)
index 0000000..7e5d73d
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef PTL_DISPATCH_H
+#define PTL_DISPATCH_H
+
+/*
+ * include/dispatch.h
+ *
+ * Dispatch table header and externs for remote side
+ * operations
+ *
+ * Generated by idl
+ *
+ */
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args,
+                           void *ret);
+extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args,
+                                  void *ret);
+extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlACEntry(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret);
+
+extern char *dispatch_name(int index);
+#endif
diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
new file mode 100644 (file)
index 0000000..ec3393b
--- /dev/null
@@ -0,0 +1,383 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+/* Non-zero iff both cookies of *wh equal those of PTL_WIRE_HANDLE_NONE. */
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        if (wh->wh_interface_cookie != PTL_WIRE_HANDLE_NONE.wh_interface_cookie)
+                return 0;
+
+        return (wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+/*
+ * Unlink the first object on fl's list and return a pointer to its
+ * fo_contents payload, or NULL when the list is empty.
+ * Caller must hold the state lock.
+ */
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        lib_freeobj_t *head;
+
+        if (!list_empty (&fl->fl_list)) {
+                head = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+                list_del (&head->fo_list);
+                return &head->fo_contents;
+        }
+
+        return NULL;
+}
+
+/*
+ * Give obj back to fl.  list_entry() is used here only to step back
+ * from the fo_contents payload to the enclosing lib_freeobj_t.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        lib_freeobj_t *owner;
+
+        owner = list_entry (obj, lib_freeobj_t, fo_contents);
+        list_add (&owner->fo_list, &fl->fl_list);
+}
+
+
+/*
+ * Take a lib_eq_t from the NAL's ni_free_eqs freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_eqs);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put eq back on the ni_free_eqs freelist.  Caller must hold the state lock. */
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+}
+
+/*
+ * Take a lib_md_t from the NAL's ni_free_mds freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_mds);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put md back on the ni_free_mds freelist.  Caller must hold the state lock. */
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        lib_freelist_free (&nal->ni.ni_free_mds, md);
+}
+
+/*
+ * Take a lib_me_t from the NAL's ni_free_mes freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_mes);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put me back on the ni_free_mes freelist.  Caller must hold the state lock. */
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        lib_freelist_free (&nal->ni.ni_free_mes, me);
+}
+
+/*
+ * Take a lib_msg_t from the ni_free_msgs freelist; NULL if it is empty.
+ * Unlike the eq/md/me allocators this does not lock: the caller must
+ * already hold the state lock.
+ */
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        void *obj = lib_freelist_alloc (&nal->ni.ni_free_msgs);
+
+        return obj;
+}
+
+/* Put msg back on the ni_free_msgs freelist.  Caller must hold the state lock. */
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab; 
+extern kmem_cache_t *ptl_msg_slab; 
+extern kmem_cache_t *ptl_me_slab; 
+extern kmem_cache_t *ptl_eq_slab; 
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+/*
+ * Allocate a lib_eq_t from ptl_eq_slab (GFP_KERNEL) and, on success,
+ * bump eq_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        lib_eq_t *new_eq;
+
+        new_eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL);
+        if (new_eq != NULL)
+                atomic_inc (&eq_in_use_count);
+
+        return new_eq;
+}
+
+/*
+ * Drop eq from the in-use count and hand it back to ptl_eq_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        atomic_dec (&eq_in_use_count);
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+/*
+ * Allocate a lib_md_t from ptl_md_slab (GFP_KERNEL) and, on success,
+ * bump md_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        lib_md_t *new_md;
+
+        new_md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL);
+        if (new_md != NULL)
+                atomic_inc (&md_in_use_count);
+
+        return new_md;
+}
+
+/*
+ * Drop md from the in-use count and hand it back to ptl_md_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        atomic_dec (&md_in_use_count);
+        kmem_cache_free(ptl_md_slab, md);
+}
+
+/*
+ * Allocate a lib_me_t from ptl_me_slab (GFP_KERNEL) and, on success,
+ * bump me_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        lib_me_t *new_me;
+
+        new_me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL);
+        if (new_me != NULL)
+                atomic_inc (&me_in_use_count);
+
+        return new_me;
+}
+
+/*
+ * Drop me from the in-use count and hand it back to ptl_me_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        atomic_dec (&me_in_use_count);
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+/*
+ * Allocate a lib_msg_t from ptl_msg_slab and, on success, bump
+ * msg_in_use_count.  Returns NULL if the allocation fails.
+ * Called with the state lock held, hence GFP_ATOMIC rather than
+ * GFP_KERNEL.
+ */
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        lib_msg_t *new_msg;
+
+        new_msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+        if (new_msg != NULL)
+                atomic_inc (&msg_in_use_count);
+
+        return new_msg;
+}
+
+/*
+ * Drop msg from the in-use count and hand it back to ptl_msg_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        atomic_dec (&msg_in_use_count);
+        kmem_cache_free(ptl_msg_slab, msg);
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+/* Store the cookie of eq's embedded handle (eq_lh) into *handle. */
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+/*
+ * Map an EQ handle to the lib_eq_t whose eq_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_eq_t, eq_lh);
+
+        return NULL;
+}
+
+/* Store the cookie of md's embedded handle (md_lh) into *handle. */
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        handle->cookie = md->md_lh.lh_cookie;
+}
+
+/*
+ * Map an MD handle to the lib_md_t whose md_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_md_t, md_lh);
+
+        return NULL;
+}
+
+/*
+ * Map a wire handle to a local lib_md_t.  Returns NULL if the handle's
+ * interface cookie is not this NAL's ni_interface_cookie, or if the
+ * object cookie is not found by lib_lookup_cookie().
+ * Caller must hold the state lock.
+ */
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        /* Interface cookie must match before the object cookie is looked up */
+        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+                return NULL;
+
+        found = lib_lookup_cookie (nal, wh->wh_object_cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_md_t, md_lh);
+
+        return NULL;
+}
+
+/* Store the cookie of me's embedded handle (me_lh) into *handle. */
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        handle->cookie = me->me_lh.lh_cookie;
+}
+
+/*
+ * Map an ME handle to the lib_me_t whose me_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_me_t, me_lh);
+
+        return NULL;
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie, will contain the necessary information for
+ * finalizing the message.
+ *
+ * After it has finished handling the message, it should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+/* buf -> iov: the char * argument is the source buffer, named as in lib_copy_buf2kiov() */
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h
new file mode 100644 (file)
index 0000000..4052c0c
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef _LIB_NAL_H_
+#define _LIB_NAL_H_
+
+/*
+ * nal.h
+ *
+ * Library side headers that define the abstraction layer's
+ * responsibilities and interfaces
+ */
+
+#include <portals/lib-types.h>
+
+/*
+ * Table of callbacks (plus per-interface state) through which the
+ * library side calls into a NAL.  Every callback receives the
+ * nal_cb_t itself as its first argument.
+ */
+struct nal_cb_t {
+       /*
+        * Per interface portal table, access control table
+        * and NAL private data field.
+        */
+       lib_ni_t ni;
+       void *nal_data;
+       /*
+        * send:  Sends a preformatted header and user data to a
+        * specified remote process.
+        * Can overwrite iov.
+        */
+       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                       unsigned int niov, struct iovec *iov, size_t mlen);
+
+       /* as send, but with a set of page fragments (NULL if not supported) */
+       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       /*
+        * recv: Receives an incoming message from a remote process
+        * Type of iov depends on options.  Can overwrite iov.
+        */
+       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                       unsigned int niov, struct iovec *iov, size_t mlen,
+                       size_t rlen);
+
+       /* as recv, but with a set of page fragments (NULL if not supported) */
+       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen,
+                             size_t rlen);
+       /*
+        * read: Reads a block of data from a specified user address
+        */
+       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                       user_ptr src_addr, size_t len);
+
+       /*
+        * write: Writes a block of data into a specified user address
+        */
+       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dst_addr,
+                        void *src_addr, size_t len);
+
+       /*
+        * callback: Calls an event callback
+        */
+       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev);
+
+       /*
+        * malloc: Acquire a block of memory in a system independent
+        * fashion.
+        */
+       void *(*cb_malloc) (nal_cb_t * nal, size_t len);
+
+       /* free: Release a block obtained from cb_malloc; len is passed back in */
+       void (*cb_free) (nal_cb_t * nal, void *buf, size_t len);
+
+       /*
+        * (un)map: Tell the NAL about some memory it will access.
+        * *addrkey passed to cb_unmap() is what cb_map() set it to.
+        * type of *iov depends on options.
+        * Set to NULL if not required.
+        */
+       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
+                      void **addrkey);
+       void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
+                         void **addrkey);
+
+       /* as (un)map, but with a set of page fragments */
+       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
+                            void **addrkey);
+       void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
+                         void **addrkey);
+
+       void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...);
+
+       /* Turn interrupts off (begin of protected area) */
+       void (*cb_cli) (nal_cb_t * nal, unsigned long *flags);
+
+       /* Turn interrupts on (end of protected area) */
+       void (*cb_sti) (nal_cb_t * nal, unsigned long *flags);
+
+       /*
+        * Calculate a network "distance" to given node
+        */
+       int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist);
+};
+
+#endif
diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h
new file mode 100644 (file)
index 0000000..ec3393b
--- /dev/null
@@ -0,0 +1,383 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+/* Non-zero iff both cookies of *wh equal those of PTL_WIRE_HANDLE_NONE. */
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        if (wh->wh_interface_cookie != PTL_WIRE_HANDLE_NONE.wh_interface_cookie)
+                return 0;
+
+        return (wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+/*
+ * Unlink the first object on fl's list and return a pointer to its
+ * fo_contents payload, or NULL when the list is empty.
+ * Caller must hold the state lock.
+ */
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        lib_freeobj_t *head;
+
+        if (!list_empty (&fl->fl_list)) {
+                head = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+                list_del (&head->fo_list);
+                return &head->fo_contents;
+        }
+
+        return NULL;
+}
+
+/*
+ * Give obj back to fl.  list_entry() is used here only to step back
+ * from the fo_contents payload to the enclosing lib_freeobj_t.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        lib_freeobj_t *owner;
+
+        owner = list_entry (obj, lib_freeobj_t, fo_contents);
+        list_add (&owner->fo_list, &fl->fl_list);
+}
+
+
+/*
+ * Take a lib_eq_t from the NAL's ni_free_eqs freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_eqs);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put eq back on the ni_free_eqs freelist.  Caller must hold the state lock. */
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+}
+
+/*
+ * Take a lib_md_t from the NAL's ni_free_mds freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_mds);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put md back on the ni_free_mds freelist.  Caller must hold the state lock. */
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        lib_freelist_free (&nal->ni.ni_free_mds, md);
+}
+
+/*
+ * Take a lib_me_t from the NAL's ni_free_mes freelist; NULL if the
+ * freelist is empty.  Acquires and releases the state lock itself, so
+ * the caller must NOT already hold it.
+ */
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        unsigned long  lock_flags;
+        void          *obj;
+
+        state_lock (nal, &lock_flags);
+        obj = lib_freelist_alloc (&nal->ni.ni_free_mes);
+        state_unlock (nal, &lock_flags);
+
+        return obj;
+}
+
+/* Put me back on the ni_free_mes freelist.  Caller must hold the state lock. */
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        lib_freelist_free (&nal->ni.ni_free_mes, me);
+}
+
+/*
+ * Take a lib_msg_t from the ni_free_msgs freelist; NULL if it is empty.
+ * Unlike the eq/md/me allocators this does not lock: the caller must
+ * already hold the state lock.
+ */
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        void *obj = lib_freelist_alloc (&nal->ni.ni_free_msgs);
+
+        return obj;
+}
+
+/* Put msg back on the ni_free_msgs freelist.  Caller must hold the state lock. */
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab; 
+extern kmem_cache_t *ptl_msg_slab; 
+extern kmem_cache_t *ptl_me_slab; 
+extern kmem_cache_t *ptl_eq_slab; 
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+/*
+ * Allocate a lib_eq_t from ptl_eq_slab (GFP_KERNEL) and, on success,
+ * bump eq_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        lib_eq_t *new_eq;
+
+        new_eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL);
+        if (new_eq != NULL)
+                atomic_inc (&eq_in_use_count);
+
+        return new_eq;
+}
+
+/*
+ * Drop eq from the in-use count and hand it back to ptl_eq_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        atomic_dec (&eq_in_use_count);
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+/*
+ * Allocate a lib_md_t from ptl_md_slab (GFP_KERNEL) and, on success,
+ * bump md_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        lib_md_t *new_md;
+
+        new_md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL);
+        if (new_md != NULL)
+                atomic_inc (&md_in_use_count);
+
+        return new_md;
+}
+
+/*
+ * Drop md from the in-use count and hand it back to ptl_md_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        atomic_dec (&md_in_use_count);
+        kmem_cache_free(ptl_md_slab, md);
+}
+
+/*
+ * Allocate a lib_me_t from ptl_me_slab (GFP_KERNEL) and, on success,
+ * bump me_in_use_count.  Returns NULL if the allocation fails.
+ * Caller must NOT hold the state lock.
+ */
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        lib_me_t *new_me;
+
+        new_me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL);
+        if (new_me != NULL)
+                atomic_inc (&me_in_use_count);
+
+        return new_me;
+}
+
+/*
+ * Drop me from the in-use count and hand it back to ptl_me_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        atomic_dec (&me_in_use_count);
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+/*
+ * Allocate a lib_msg_t from ptl_msg_slab and, on success, bump
+ * msg_in_use_count.  Returns NULL if the allocation fails.
+ * Called with the state lock held, hence GFP_ATOMIC rather than
+ * GFP_KERNEL.
+ */
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        lib_msg_t *new_msg;
+
+        new_msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+        if (new_msg != NULL)
+                atomic_inc (&msg_in_use_count);
+
+        return new_msg;
+}
+
+/*
+ * Drop msg from the in-use count and hand it back to ptl_msg_slab.
+ * Caller must hold the state lock.
+ */
+static inline void
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        atomic_dec (&msg_in_use_count);
+        kmem_cache_free(ptl_msg_slab, msg);
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+/* Store the cookie of eq's embedded handle (eq_lh) into *handle. */
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+/*
+ * Map an EQ handle to the lib_eq_t whose eq_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_eq_t, eq_lh);
+
+        return NULL;
+}
+
+/* Store the cookie of md's embedded handle (md_lh) into *handle. */
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        handle->cookie = md->md_lh.lh_cookie;
+}
+
+/*
+ * Map an MD handle to the lib_md_t whose md_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_md_t, md_lh);
+
+        return NULL;
+}
+
+/*
+ * Map a wire handle to a local lib_md_t.  Returns NULL if the handle's
+ * interface cookie is not this NAL's ni_interface_cookie, or if the
+ * object cookie is not found by lib_lookup_cookie().
+ * Caller must hold the state lock.
+ */
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        /* Interface cookie must match before the object cookie is looked up */
+        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+                return NULL;
+
+        found = lib_lookup_cookie (nal, wh->wh_object_cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_md_t, md_lh);
+
+        return NULL;
+}
+
+/* Store the cookie of me's embedded handle (me_lh) into *handle. */
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        handle->cookie = me->me_lh.lh_cookie;
+}
+
+/*
+ * Map an ME handle to the lib_me_t whose me_lh carries the same cookie.
+ * Returns NULL when lib_lookup_cookie() finds nothing.
+ * Caller must hold the state lock.
+ */
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        lib_handle_t *found;
+
+        found = lib_lookup_cookie (nal, handle->cookie);
+        if (found != NULL)
+                return lh_entry (found, lib_me_t, me_lh);
+
+        return NULL;
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie, will contain the necessary information for
+ * finalizing the message.
+ *
+ * After it has finished handling the message, it should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+/* buf -> iov: the char * argument is the source buffer, named as in lib_copy_buf2kiov() */
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
new file mode 100644 (file)
index 0000000..08ea118
--- /dev/null
@@ -0,0 +1,273 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * p30/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef _LIB_TYPES_H_
+#define _LIB_TYPES_H_
+
+#include <portals/types.h>
+#ifdef __KERNEL__
+# define PTL_USE_SLAB_CACHE
+# include <linux/uio.h>
+# include <linux/smp_lock.h>
+# include <linux/types.h>
+#else
+# include <sys/types.h>
+#endif
+
+/* struct nal_cb_t is defined in lib-nal.h */
+typedef struct nal_cb_t nal_cb_t;
+
+typedef char *user_ptr;
+typedef struct lib_msg_t lib_msg_t;
+typedef struct lib_ptl_t lib_ptl_t;
+typedef struct lib_ac_t lib_ac_t;
+typedef struct lib_me_t lib_me_t;
+typedef struct lib_md_t lib_md_t;
+typedef struct lib_eq_t lib_eq_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+        __u64 wh_interface_cookie;
+        __u64 wh_object_cookie;
+} ptl_handle_wire_t;
+
+/* byte-flip insensitive! */
+#define PTL_WIRE_HANDLE_NONE \
+((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
+
+typedef enum {
+        PTL_MSG_ACK = 0,
+        PTL_MSG_PUT,
+        PTL_MSG_GET,
+        PTL_MSG_REPLY,
+        PTL_MSG_HELLO,
+} ptl_msg_type_t;
+
+/* Each of these structs should start with an odd number of
+ * __u32, or the compiler could add its own padding and confuse
+ * everyone.
+ *
+ * Also, "length" needs to be at offset 28 of each struct.
+ */
+typedef struct ptl_ack {
+        ptl_size_t mlength;
+        ptl_handle_wire_t dst_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for acks) moving out RSN */
+} ptl_ack_t;
+
+typedef struct ptl_put {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t ack_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length moving out RSN */
+        ptl_size_t offset;
+        ptl_hdr_data_t hdr_data;
+} ptl_put_t;
+
+typedef struct ptl_get {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t return_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for gets) moving out RSN */
+        ptl_size_t src_offset;
+        ptl_size_t return_offset;               /* unused: going RSN */
+        ptl_size_t sink_length;
+} ptl_get_t;
+
+typedef struct ptl_reply {
+        __u32 unused1;                          /* unused fields going RSN */
+        ptl_handle_wire_t dst_wmd;
+        ptl_size_t dst_offset;                  /* unused: going RSN */
+        __u32 unused2;
+        ptl_size_t length;                      /* common length moving out RSN */
+} ptl_reply_t;
+
+typedef struct {
+        ptl_nid_t dest_nid;
+        ptl_nid_t src_nid;
+        ptl_pid_t dest_pid;
+        ptl_pid_t src_pid;
+        __u32 type; /* ptl_msg_type_t */
+        union {
+                ptl_ack_t ack;
+                ptl_put_t put;
+                ptl_get_t get;
+                ptl_reply_t reply;
+        } msg;
+} ptl_hdr_t;
+
+/* All length fields in individual unions at same offset */
+/* LASSERT for same in lib-move.c */
+#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length)
+
+/* A HELLO message contains the portals magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * PTL_MSG_HELLO in the type field.  All other fields are zero (including
+ * PTL_HDR_LENGTH; i.e. no payload).
+ * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID, so that hosts with
+ * multiple IP interfaces can have a single NID. These NALs should exchange
+ * HELLO messages when a connection is first established. */
+typedef struct {
+        __u32  magic;                          /* PORTALS_PROTO_MAGIC */
+        __u16   version_major;                  /* increment on incompatible change */
+        __u16   version_minor;                  /* increment on compatible change */
+} ptl_magicversion_t;
+
+#define PORTALS_PROTO_MAGIC                0xeebc0ded
+
+#define PORTALS_PROTO_VERSION_MAJOR        0
+#define PORTALS_PROTO_VERSION_MINOR        1
+
+typedef struct {
+        long recv_count, recv_length, send_count, send_length, drop_count,
+            drop_length, msgs_alloc, msgs_max;
+} lib_counters_t;
+
+/* temporary expedient: limit number of entries in discontiguous MDs */
+#if PTL_LARGE_MTU
+# define PTL_MD_MAX_IOV        64
+#else
+# define PTL_MD_MAX_IOV 16
+#endif
+
+struct lib_msg_t {
+        struct list_head  msg_list;
+        int               send_ack;
+        lib_md_t         *md;
+        ptl_nid_t         nid;
+        ptl_pid_t         pid;
+        ptl_event_t       ev;
+        ptl_handle_wire_t ack_wmd;
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } msg_iov;
+};
+
+struct lib_ptl_t {
+        ptl_pt_index_t size;
+        struct list_head *tbl;
+};
+
+struct lib_ac_t {
+        int next_free;
+};
+
+typedef struct {
+        struct list_head  lh_hash_chain; /* presumably a bucket chain in ni_lh_hash_table - confirm */
+        __u64             lh_cookie;     /* presumably the object cookie (cf. wh_object_cookie) - confirm */
+} lib_handle_t;
+/* lh_entry: from a pointer to 'member', step back to the 'type' that embeds it */
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+struct lib_eq_t {
+        struct list_head  eq_list;
+        lib_handle_t      eq_lh;
+        ptl_seq_t         sequence;
+        ptl_size_t        size;
+        ptl_event_t      *base;
+        int               eq_refcount;
+        int (*event_callback) (ptl_event_t * event);
+        void             *eq_addrkey;
+};
+
+struct lib_me_t {
+        struct list_head  me_list;
+        lib_handle_t      me_lh;
+        ptl_process_id_t  match_id;
+        ptl_match_bits_t  match_bits, ignore_bits;
+        ptl_unlink_t      unlink;
+        lib_md_t         *md;
+};
+
+struct lib_md_t {
+        struct list_head  md_list;
+        lib_handle_t      md_lh;
+        lib_me_t         *me;
+        user_ptr          start;
+        ptl_size_t        offset;
+        ptl_size_t        length;
+        ptl_size_t        max_size;
+        int               threshold;
+        int               pending;
+        ptl_unlink_t      unlink;
+        unsigned int      options;
+        unsigned int      md_flags;
+        void             *user_ptr;
+        lib_eq_t         *eq;
+        void             *md_addrkey;
+        unsigned int      md_niov;                /* # frags */
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } md_iov;
+};
+
+#define PTL_MD_FLAG_UNLINK            (1 << 0)
+#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
+
+#ifndef PTL_USE_SLAB_CACHE
+typedef struct
+{
+        void             *fl_objs;             /* single contiguous array of objects */
+        int                fl_nobjs;            /* the number of them */
+        int                fl_objsize;          /* the size (including overhead) of each of them */
+        struct list_head   fl_list;             /* where they are enqueued */
+} lib_freelist_t;
+
+typedef struct
+{
+        struct list_head   fo_list;             /* enqueue on fl_list */
+        void              *fo_contents;         /* aligned contents */
+} lib_freeobj_t;
+#endif
+
+typedef struct {
+        /* info about peers we are trying to fail */
+        struct list_head  tp_list;             /* stash in ni.ni_test_peers */
+        ptl_nid_t         tp_nid;              /* matching nid */
+        unsigned int      tp_threshold;        /* # failures to simulate */
+} lib_test_peer_t;
+
+typedef struct {
+        int up;
+        int refcnt;
+        ptl_nid_t nid;
+        ptl_pid_t pid;
+        int num_nodes;
+        unsigned int debug;
+        lib_ptl_t tbl;
+        lib_ac_t ac;
+        lib_counters_t counters;
+
+        int               ni_lh_hash_size;      /* size of lib handle hash table */
+        struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
+        __u64             ni_next_object_cookie; /* cookie generator */
+        __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
+        
+        struct list_head ni_test_peers;
+        
+#ifndef PTL_USE_SLAB_CACHE
+        lib_freelist_t   ni_free_mes;
+        lib_freelist_t   ni_free_msgs;
+        lib_freelist_t   ni_free_mds;
+        lib_freelist_t   ni_free_eqs;
+#endif
+        struct list_head ni_active_msgs;
+        struct list_head ni_active_mds;
+        struct list_head ni_active_eqs;
+} lib_ni_t;
+
+#endif
diff --git a/lnet/include/lnet/list.h b/lnet/include/lnet/list.h
new file mode 100644 (file)
index 0000000..41613ab
--- /dev/null
@@ -0,0 +1,246 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define prefetch(a) ((void)(a))        /* no-op here: evaluates and discards a */
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head * new,
+                             struct list_head * prev,
+                             struct list_head * next)
+{
+       next->prev = new;
+       new->next = next;
+       new->prev = prev;
+       prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       next->prev = prev;
+       prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty (head points back at itself)
+ * @head: the list to test; only read, never modified.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+       return head->next == head;
+}
+
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *at = head->next;
+       /* list's first entry becomes head's immediate successor ... */
+       first->prev = head;
+       head->next = first;
+       /* ... and list's last entry is linked in front of head's old successor */
+       last->next = at;
+       at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (!list_empty(list)) {
+               __list_splice(list, head);
+               INIT_LIST_HEAD(list);
+       }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+               pos = pos->next, prefetch(pos->next))
+
+/**
+ * list_for_each_prev  -       iterate over a list in reverse order
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
+               pos = pos->prev, prefetch(pos->prev))
+
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+#endif
+
+#ifndef list_for_each_entry
+/**
+ * list_for_each_entry  -       iterate over list of given type
+ * @pos:        the type * to use as a loop counter; __typeof__(*pos) names the entry type.
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)                             \
+        for (pos = list_entry((head)->next, __typeof__(*pos), member),     \
+                    prefetch(pos->member.next);                            \
+            &pos->member != (head);                                        \
+            pos = list_entry(pos->member.next, __typeof__(*pos), member),  \
+            prefetch(pos->member.next))
+#endif
+
+#ifndef list_for_each_entry_safe
+/**
+ * list_for_each_entry_safe  -       iterate over list of given type safe against removal of list entry
+ * @pos:        the type * to use as a loop counter; __typeof__(*pos) names the entry type.
+ * @n:          the &struct list_head to use as temporary storage
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                     \
+        for (pos = list_entry((head)->next, __typeof__(*pos), member),     \
+                    n = pos->member.next;                                  \
+            &pos->member != (head);                                        \
+            pos = list_entry(n, __typeof__(*pos), member),                 \
+            n = pos->member.next)
+#endif
diff --git a/lnet/include/lnet/lltrace.h b/lnet/include/lnet/lltrace.h
new file mode 100644 (file)
index 0000000..7d1b304
--- /dev/null
@@ -0,0 +1,175 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl 
+ */
+#ifndef __LTRACE_H_
+#define __LTRACE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <portals/types.h>
+#include <portals/ptlctl.h>
+#include <linux/kp30.h>
+#include <linux/limits.h>
+#include <asm/page.h>
+#include <linux/version.h>
+
+/*
+ * Echo the command line "debug_kernel <fname> 1" to stderr, then hand the
+ * same three-element argument vector to jt_dbg_debug_kernel() and return
+ * whatever it returns.
+ */
+static inline int ltrace_write_file(char* fname)
+{
+        char *cmd[] = { "debug_kernel", fname, "1" };
+
+        fprintf(stderr, "[ptlctl] %s %s %s\n", cmd[0], cmd[1], cmd[2]);
+        return jt_dbg_debug_kernel(3, cmd);
+}
+
+/* Echo the "clear" command to stderr, then run it through
+ * jt_dbg_clear_debug_buf() and return that call's result. */
+static inline int ltrace_clear(void)
+{
+        char *argv[1] = { "clear" };
+
+        fprintf(stderr, "[ptlctl] %s\n", argv[0]);
+
+        return jt_dbg_clear_debug_buf(1, argv);
+}
+
+/* Format "====<indent_level>=<text>" (cut off at PATH_MAX - 1 characters) and
+ * pass it as the argument of a "mark" command to jt_dbg_mark_debug_buf(). */
+static inline int ltrace_mark(int indent_level, char* text)
+{
+        char  label[PATH_MAX];
+        char *cmd[] = { "mark", label };
+
+        snprintf(label, sizeof(label), "====%d=%s", indent_level, text);
+
+        return jt_dbg_mark_debug_buf(2, cmd);
+}
+
+/* Echo "list applymasks" to stderr, then run it through jt_dbg_list()
+ * and return that call's result. */
+static inline int ltrace_applymasks(void)
+{
+        char *argv[2] = { "list", "applymasks" };
+
+        fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]);
+
+        return jt_dbg_list(2, argv);
+}
+
+
+/* Hand { "filter", subsys_or_mask } to jt_dbg_filter() and return its result. */
+static inline int ltrace_filter(char* subsys_or_mask)
+{
+        char *cmd[] = { "filter", subsys_or_mask };
+
+        return jt_dbg_filter(2, cmd);
+}
+
+/* Hand { "show", subsys_or_mask } to jt_dbg_show() and return its result. */
+static inline int ltrace_show(char* subsys_or_mask)
+{
+        char *cmd[] = { "show", subsys_or_mask };
+
+        return jt_dbg_show(2, cmd);
+}
+
+/* Calls dbg_initialize(), registers the ioctl device when PORTALS_DEV_ID is
+ * defined (returning that result, else 0), then applies a fixed mask list. */
+static inline int ltrace_start(void)
+{
+        int rc = 0;
+        dbg_initialize(0, NULL);
+#ifdef PORTALS_DEV_ID
+        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#endif
+        ltrace_filter("class");
+        ltrace_filter("socknal");
+        ltrace_filter("qswnal");
+        ltrace_filter("gmnal");
+        ltrace_filter("portals");
+        ltrace_show("all_types");
+        ltrace_filter("trace");
+        ltrace_filter("malloc");
+        ltrace_filter("net");
+        ltrace_filter("page");
+        ltrace_filter("other");
+        ltrace_filter("info");
+        ltrace_applymasks();
+        return rc;      /* results of the filter/show calls are not checked */
+}
+
+/* Calls unregister_ioc_dev(PORTALS_DEV_ID); does nothing if that is undefined. */
+static inline void ltrace_stop(void)
+{
+#ifdef PORTALS_DEV_ID
+        unregister_ioc_dev(PORTALS_DEV_ID);
+#endif
+}
+
+static inline int not_uml(void)
+{
+  /* Return Values:
+   *   0 when /dev/ubd exists (run under UML)
+   *   1 when it does not (run on host), or when stat() fails for any
+   *     other reason: that error is printed and "host" is assumed
+   */
+       struct stat buf;
+       int rc = stat("/dev/ubd", &buf);
+       rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc;
+       if (rc<0) {
+         fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+         rc = 1; /* Assume host */
+       }
+       return rc;
+}
+
+#define LTRACE_MAX_NOB   256     /* bytes in the command buffer below */
+static inline void ltrace_add_processnames(char* fname)
+{
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        struct timezone tz;
+        int nob;
+        int underuml = !not_uml();
+        /* Runs: ps --no-headers -eo "<CDEBUG-style prefix> %p %c" >> fname
+         * NOTE: fname reaches the shell unquoted - pass only trusted paths. */
+        gettimeofday(&tv, &tz);
+        nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \"");
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB-nob, "%02x:%06x:%d:%lu.%06lu ",
+                        S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec);
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB-nob,
+                                 "(%s:%d:%s() %d | %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB-nob,
+                                 "(%s:%d:%s() %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB-nob, " %%p %%c\" >> %s", fname);
+        if (nob < LTRACE_MAX_NOB)       /* fname too long => command truncated: skip */
+                system(cmdbuf);
+}
+
+#endif
diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define.
+ * It should probably give them better names...
+ */
+#define PTL_DEBUG_NI_ALL        (0xFF000000ul) /* Only the NAL flags: bits 24-31 */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h
new file mode 100644 (file)
index 0000000..fdaae69
--- /dev/null
@@ -0,0 +1,74 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h
new file mode 100644 (file)
index 0000000..6a61fd5
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+** $Id: myrnal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+       void *args;
+       size_t args_len;
+       void *ret;
+       size_t ret_len;
+       int p3cmd;
+} myrnal_forward_t;
+
+#endif                         /* MYRNAL_H */
diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h
new file mode 100644 (file)
index 0000000..c1c50ed
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+** $Id: nal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+
+/*
+ * p30/nal.h
+ *
+ * The API side NAL declarations
+ */
+
+#include <portals/types.h>
+
+#ifdef yield
+#undef yield
+#endif
+
+typedef struct nal_t nal_t;
+
+struct nal_t {
+       ptl_ni_t ni;
+       int refct;
+       void *nal_data;
+       int *timeout;           /* for libp30api users */
+       int (*forward) (nal_t * nal, int index, /* Function ID */
+                       void *args, size_t arg_len, void *ret, size_t ret_len);
+
+       int (*shutdown) (nal_t * nal, int interface);
+
+       int (*validate) (nal_t * nal, void *base, size_t extent);
+
+       void (*yield) (nal_t * nal);
+
+       void (*lock) (nal_t * nal, unsigned long *flags);
+
+       void (*unlock) (nal_t * nal, unsigned long *flags);
+};
+
+typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);
+
+#ifndef PTL_IFACE_DEFAULT
+#define PTL_IFACE_DEFAULT (PTL_IFACE_IP)
+#endif
+
+#endif
diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h
new file mode 100644 (file)
index 0000000..1b837b4
--- /dev/null
@@ -0,0 +1,4 @@
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define.
+ * It should probably give them better names...
+ */
+#define PTL_DEBUG_NI_ALL        (0xFF000000ul) /* Only the NAL flags: bits 24-31 */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lnet/include/lnet/ppid.h b/lnet/include/lnet/ppid.h
new file mode 100644 (file)
index 0000000..34e5dc5
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * TITLE(ppid_h, "@(#) $Id: ppid.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $");
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* this needs to fit into 16 bits so the 
+                                    maximum value is 65535. having it "large"
+                                    can help w/ debugging process accounting
+                                    but there are reasons for making it 
+                                    somewhat smaller than the maximum --
+                                    requiring storage for arrays that index 
+                                    on the ppid, eg...  */
+                                 
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+/* Parenthesized so the sums stay intact inside larger expressions */
+#define PPID_FLOATING    (MAX_FIXED_PPID+1)   /* Floating area starts here */
+#define GID_FLOATING     (MAX_FIXED_GID+1)    /* Floating area starts here */
+#define NUM_PTL_TASKS    (MAX_FIXED_PPID+80)  /* Maximum no. portals tasks */
+
+#define PPID_AUTO        0
+
+/* Minimum PPID is 1 */
+#define PPID_BEBOPD      1            /* bebopd */
+#define  GID_BEBOPD      1            /* bebopd */
+
+#define PPID_PCT         2            /* pct */
+#define  GID_PCT         2            /* pct */
+
+#define PPID_FYOD        3            /* fyod */
+#define  GID_FYOD        3            /* fyod */
+
+#define PPID_GDBWRAP     11           /* portals proxy for gdb */
+#define  GID_GDBWRAP     11           /* portals proxy for gdb */
+
+#define PPID_TEST        15           /* for portals tests */
+#define  GID_TEST        15
+
+#define  GID_YOD         5            /* yod */
+#define  GID_PINGD       6            /* pingd */
+#define  GID_BT          7            /* bt */
+#define  GID_PTLTEST     8            /* ptltest */
+#define  GID_CGDB        9            /* cgdb */
+#define  GID_TVDSVR     10            /* start-tvdsvr */
+
+#endif /* _INCppidh_ */
diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h
new file mode 100644 (file)
index 0000000..fdaae69
--- /dev/null
@@ -0,0 +1,74 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h
new file mode 100644 (file)
index 0000000..65ab189
--- /dev/null
@@ -0,0 +1,6 @@
+/*
+** $Id: stringtab.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+/*
+ * stringtab.h
+ */
diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h
new file mode 100644 (file)
index 0000000..d4038b6
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef _P30_TYPES_H_
+#define _P30_TYPES_H_
+
+#ifdef __linux__
+#include <asm/types.h>
+#include <asm/timex.h>
+#else
+#include <sys/types.h>
+typedef u_int32_t __u32;
+typedef u_int64_t __u64;
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void) { return 0; }
+#endif
+
+typedef __u64 ptl_nid_t;
+typedef __u32 ptl_pid_t;
+typedef __u32 ptl_pt_index_t;
+typedef __u32 ptl_ac_index_t;
+typedef __u64 ptl_match_bits_t;
+typedef __u64 ptl_hdr_data_t;
+typedef __u32 ptl_size_t;
+
+typedef struct {
+        unsigned long nal_idx;                 /* which network interface */
+        __u64         cookie;                  /* which thing on that interface */
+} ptl_handle_any_t;
+
+typedef ptl_handle_any_t ptl_handle_ni_t;
+typedef ptl_handle_any_t ptl_handle_eq_t;
+typedef ptl_handle_any_t ptl_handle_md_t;
+typedef ptl_handle_any_t ptl_handle_me_t;
+
+#define PTL_HANDLE_NONE \
+((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
+#define PTL_EQ_NONE PTL_HANDLE_NONE
+
+/*
+ * Compare two handles.  Returns 1 when both the interface index and the
+ * cookie match, 0 otherwise.
+ */
+static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+{
+        if (h1.nal_idx != h2.nal_idx)
+                return 0;
+
+        return (h1.cookie == h2.cookie);
+}
+
+#define PTL_NID_ANY      ((ptl_nid_t) -1)
+#define PTL_PID_ANY      ((ptl_pid_t) -1)
+
+typedef struct {
+        ptl_nid_t nid;
+        ptl_pid_t pid;   /* node id / process id */
+} ptl_process_id_t;
+
+typedef enum {
+        PTL_RETAIN = 0,
+        PTL_UNLINK
+} ptl_unlink_t;
+
+typedef enum {
+        PTL_INS_BEFORE,
+        PTL_INS_AFTER
+} ptl_ins_pos_t;
+
+typedef struct {
+       struct page     *kiov_page;
+       unsigned int     kiov_len;
+       unsigned int     kiov_offset;
+} ptl_kiov_t;
+
+typedef struct {
+        void            *start;
+        ptl_size_t       length;
+        int              threshold;
+        int              max_size;
+        unsigned int     options;
+        void            *user_ptr;
+        ptl_handle_eq_t  eventq;
+       unsigned int     niov;
+} ptl_md_t;
+
+/* Options for the MD structure */
+#define PTL_MD_OP_PUT           (1 << 0)
+#define PTL_MD_OP_GET           (1 << 1)
+#define PTL_MD_MANAGE_REMOTE    (1 << 2)
+#define PTL_MD_AUTO_UNLINK      (1 << 3)
+#define PTL_MD_TRUNCATE         (1 << 4)
+#define PTL_MD_ACK_DISABLE      (1 << 5)
+#define PTL_MD_IOV             (1 << 6)
+#define PTL_MD_MAX_SIZE                (1 << 7)
+#define PTL_MD_KIOV             (1 << 8)
+
+#define PTL_MD_THRESH_INF       (-1)
+
+typedef enum {
+        PTL_EVENT_GET,
+        PTL_EVENT_PUT,
+        PTL_EVENT_REPLY,
+        PTL_EVENT_ACK,
+        PTL_EVENT_SENT
+} ptl_event_kind_t;
+
+#define PTL_SEQ_BASETYPE       long
+typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
+#define PTL_SEQ_GT(a,b)        (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+
+typedef struct {
+        ptl_event_kind_t type;
+        ptl_process_id_t initiator;
+        ptl_pt_index_t portal;
+        ptl_match_bits_t match_bits;
+        ptl_size_t rlength, mlength, offset;
+        ptl_handle_me_t unlinked_me;
+        ptl_md_t mem_desc;
+        ptl_hdr_data_t hdr_data;
+        cycles_t  arrival_time;
+        volatile ptl_seq_t sequence;
+} ptl_event_t;
+
+
+typedef enum {
+        PTL_ACK_REQ,
+        PTL_NOACK_REQ
+} ptl_ack_req_t;
+
+
+typedef struct {
+        volatile ptl_seq_t sequence;
+        ptl_size_t size;
+        ptl_event_t *base;
+        ptl_handle_any_t cb_eq_handle;
+} ptl_eq_t;
+
+typedef struct {
+        ptl_eq_t *eq;
+} ptl_ni_t;
+
+
+typedef struct {
+        int max_match_entries;    /* max number of match entries */
+        int max_mem_descriptors;  /* max number of memory descriptors */
+        int max_event_queues;     /* max number of event queues */
+        int max_atable_index;     /* maximum access control list table index */
+        int max_ptable_index;     /* maximum portals table index */
+} ptl_ni_limits_t;
+
+/*
+ * Status registers
+ */
+typedef enum {
+        PTL_SR_DROP_COUNT,
+        PTL_SR_DROP_LENGTH,
+        PTL_SR_RECV_COUNT,
+        PTL_SR_RECV_LENGTH,
+        PTL_SR_SEND_COUNT,
+        PTL_SR_SEND_LENGTH,
+        PTL_SR_MSGS_MAX,
+} ptl_sr_index_t;
+
+typedef int ptl_sr_value_t;
+
+#endif
diff --git a/lnet/klnds/Makefile.am b/lnet/klnds/Makefile.am
new file mode 100644 (file)
index 0000000..5c6085e
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS= socknal toenal        @QSWNAL@ @GMNAL@ @SCIMACNAL@
diff --git a/lnet/klnds/Makefile.mk b/lnet/klnds/Makefile.mk
new file mode 100644 (file)
index 0000000..ce40a60
--- /dev/null
@@ -0,0 +1,4 @@
+include ../Kernelenv
+
+obj-y = socknal/
+# more coming...
\ No newline at end of file
diff --git a/lnet/klnds/gmlnd/Makefile.am b/lnet/klnds/gmlnd/Makefile.am
new file mode 100644 (file)
index 0000000..1dc6f4e
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kgmnal
+modulenet_DATA = kgmnal.o
+EXTRA_PROGRAMS = kgmnal
+
+DEFS =
+kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
diff --git a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch
new file mode 100644 (file)
index 0000000..23c80d9
--- /dev/null
@@ -0,0 +1,43 @@
+diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
+--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c        Mon Jul  1 10:35:09 2002
++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c    Thu Sep 19 14:19:38 2002
+@@ -30,6 +30,8 @@
+  *
+  ************************************************************************/
++#define EXPORT_SYMTAB
++
+ #include <linux/config.h>
+ #include <linux/module.h>
+@@ -4075,6 +4077,28 @@
+   return 0;
+ }
++EXPORT_SYMBOL(gm_blocking_receive_no_spin);
++EXPORT_SYMBOL(gm_close);
++EXPORT_SYMBOL(gm_dma_free);
++EXPORT_SYMBOL(gm_dma_malloc);
++EXPORT_SYMBOL(gm_drop_sends);
++EXPORT_SYMBOL(gm_finalize);
++EXPORT_SYMBOL(gm_get_node_id);
++EXPORT_SYMBOL(gm_init);
++EXPORT_SYMBOL(gm_initialize_alarm);
++EXPORT_SYMBOL(gm_max_node_id_in_use);
++EXPORT_SYMBOL(gm_min_size_for_length);
++EXPORT_SYMBOL(gm_num_receive_tokens);
++EXPORT_SYMBOL(gm_num_send_tokens);
++EXPORT_SYMBOL(gm_open);
++EXPORT_SYMBOL(gm_provide_receive_buffer);
++EXPORT_SYMBOL(gm_resume_sending);
++EXPORT_SYMBOL(gm_send_with_callback);
++EXPORT_SYMBOL(gm_set_acceptable_sizes);
++EXPORT_SYMBOL(gm_set_alarm);
++EXPORT_SYMBOL(gm_unknown);
++
++
+ /*
+   This file uses GM standard indentation.
+Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
+Only in gm-1.5.2.1_Linux-cfs/: trace
diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h
new file mode 100644 (file)
index 0000000..47e8c3c
--- /dev/null
@@ -0,0 +1,101 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _GMNAL_H
+#define _GMNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_GMNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <gm.h>
+
+
+/*
+ *  Myrinet GM NAL
+ */
+#define NPAGES_LARGE            16
+#define NPAGES_SMALL            1
+/* Message length limits in bytes; parenthesized so they compose safely */
+#define MSG_LEN_LARGE            (NPAGES_LARGE*PAGE_SIZE)
+#define MSG_LEN_SMALL            (NPAGES_SMALL*PAGE_SIZE)
+#define MSG_SIZE_LARGE           (gm_min_size_for_length(MSG_LEN_LARGE))
+#define MSG_SIZE_SMALL           (gm_min_size_for_length(MSG_LEN_SMALL))
+
+#define TXMSGS                  64 /* Number of Transmit Messages */
+#define ENVELOPES               8  /* Number of outstanding receive msgs */
+
+#define KGM_PORT_NUM 3
+#define KGM_HOSTNAME "kgmnal"
+
+
+/* Describes one received GM message; filled in by kgmnal_rx(). */
+typedef struct {
+        char *krx_buffer;               /* receive buffer: portals header, then payload */
+        unsigned long   krx_len;        /* number of bytes received */
+        unsigned int   krx_size;        /* GM size value handed back when the buffer is requeued */
+        unsigned int   krx_priority;    /* GM priority handed back when the buffer is requeued */
+        struct list_head krx_item;      /* NOTE(review): not referenced in the code shown here */
+}  kgmnal_rx_t;
+
+
+/* State kept for one outstanding send, from kgmnal_send() to kgmnal_tx_done(). */
+typedef struct {
+        nal_cb_t  *ktx_nal;             /* passed to lib_finalize() on completion */
+        void      *ktx_private;         /* passed to lib_finalize() on completion */
+        lib_msg_t *ktx_cookie;          /* passed to lib_finalize() on completion */
+        char      *ktx_buffer;          /* GM DMA buffer holding header + payload */
+        size_t     ktx_len;             /* bytes in ktx_buffer (header + payload) */
+        unsigned long ktx_size;         /* GM size value used for the send */
+        int        ktx_ndx;             /* NOTE(review): not set in the code shown here */
+        unsigned int ktx_priority;      /* GM priority used for the send */
+        unsigned int ktx_tgt_node;      /* destination GM node (taken from the nid) */
+        unsigned int ktx_tgt_port_id;   /* destination GM port */
+}  kgmnal_tx_t;
+
+
+typedef struct {
+        char              kgm_init;
+        char              kgm_shuttingdown;
+        struct gm_port   *kgm_port;
+        struct list_head  kgm_list;
+        ptl_nid_t         kgm_nid;
+        nal_cb_t         *kgm_cb;
+        struct kgm_trans *kgm_trans;
+        struct tq_struct  kgm_ready_tq;
+        spinlock_t        kgm_dispatch_lock;
+        spinlock_t        kgm_update_lock;
+        spinlock_t        kgm_send_lock;
+}  kgmnal_data_t;
+
+int kgm_init(kgmnal_data_t *kgm_data);
+int kgmnal_recv_thread(void *);
+int gm_return_mynid(void);
+void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+extern kgmnal_data_t      kgmnal_data;
+extern nal_t              kgmnal_api;
+extern nal_cb_t           kgmnal_lib;
+
+#endif  /* _GMNAL_H */
+
diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c
new file mode 100644 (file)
index 0000000..3d4c86d
--- /dev/null
@@ -0,0 +1,517 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* TODO
+ * preallocate send buffers, store on list
+ * put receive buffers on queue, handle with receive threads
+ * use routing
+ */
+
+#include "gmnal.h"
+
+extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
+
+/*
+ * Allocate one transmit descriptor with PORTAL_ALLOC and return it.
+ * kgmnal_send() treats a NULL result as an allocation failure.
+ */
+static kgmnal_tx_t *
+get_trans(void)
+{
+        kgmnal_tx_t *t;
+        PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
+        return t;
+}
+
+/* Release a transmit descriptor through PORTAL_FREE. */
+static void
+put_trans(kgmnal_tx_t *t)
+{
+        PORTAL_FREE(t, sizeof(kgmnal_tx_t));
+}
+
+/*
+ * Report whether 'nid' names a node on the local GM network.
+ *
+ * Returns nonzero when the nid fits in a GM node id without losing high
+ * bits and is below the value reported by gm_max_node_id_in_use();
+ * returns 0 otherwise.
+ */
+int
+kgmnal_ispeer (ptl_nid_t nid)
+{
+        unsigned int gmnid = (unsigned int)nid;
+        /* Start at 0 so that, if GM does not fill the value in, the range
+         * test below fails instead of comparing against stack garbage. */
+        unsigned int nnids = 0;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+        return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */
+                gmnid < nnids); /* it's in this machine */
+}
+
+/*
+ *  LIB functions follow
+ *
+ */
+/*
+ * cb_read callback: log the transfer at D_NET level, then copy 'len' bytes
+ * from src_addr to dst_addr with memcpy().  'private' is not used.
+ * Always returns 0.
+ */
+static int
+kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+/*
+ * cb_write callback: log the transfer at D_NET level, then copy 'len' bytes
+ * from src_addr to dst_addr with memcpy().  'private' is not used.
+ * Always returns 0.
+ */
+static int
+kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+/*
+ * cb_malloc callback: allocate 'len' bytes with PORTAL_ALLOC and return the
+ * resulting pointer.  'nal' is not used.
+ * NOTE(review): presumably NULL on failure -- confirm against PORTAL_ALLOC.
+ */
+static void *
+kgmnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+/*
+ * cb_free callback: release 'buf' through PORTAL_FREE, passing along the
+ * length supplied by the caller.  'nal' is not used.
+ */
+static void
+kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/*
+ * cb_printf callback: format the message into a 256-byte local buffer and
+ * print it, prefixed with the current CPU id.  Nothing is formatted or
+ * printed unless D_NET is set in portal_debug.  'nal' is not used.
+ */
+static void
+kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        if (!(portal_debug & D_NET))
+                return;
+
+        va_start(args, fmt);
+        vsnprintf(text, sizeof(text), fmt, args);
+        va_end(args);
+
+        printk("CPUId: %d %s",smp_processor_id(), text);
+}
+
+
+/*
+ * cb_cli callback: take kgm_dispatch_lock with spin_lock_irqsave(), storing
+ * the saved interrupt state in *flags for the matching kgmnal_sti() call.
+ */
+static void
+kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kgm_dispatch_lock,*flags);
+}
+
+
+/*
+ * cb_sti callback: drop kgm_dispatch_lock with spin_unlock_irqrestore(),
+ * restoring the interrupt state held in *flags.
+ */
+static void
+kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags);
+}
+
+
+/*
+ * cb_dist callback: store the "distance" to 'nid' in *dist -- 0 when it is
+ * this NAL's own nid, 1 for any other nid.  Always returns 0.
+ */
+static int
+kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* network distance doesn't mean much for this nal */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return 0;
+}
+
+/* FIXME rmr: add routing code here */
+/*
+ * Finish a transmit: finalize the portals message, free the GM DMA buffer,
+ * clear the buffer fields and release the descriptor.
+ * NOTE(review): 'error' is accepted but never used, so the send status
+ * computed by kgmnal_txhandler() is not reported anywhere from here.
+ */
+static void
+kgmnal_tx_done(kgmnal_tx_t  *trans, int error)
+{
+        lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
+
+        gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
+
+        trans->ktx_buffer = NULL;
+        trans->ktx_len = 0;
+
+        put_trans(trans);
+}
+/*
+ * Printable names for GM status codes, indexed by status value.  Only the
+ * codes listed here have a name; every other slot is left NULL.
+ */
+static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
+        [GM_SUCCESS] = "GM_SUCCESS",
+        [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
+        [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
+        [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
+        [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
+        [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
+        [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
+};
+
+/*
+ * Translate a GM status code into a printable name.
+ *
+ * Returns the entry from gm_error_strings[] when the code is inside the
+ * table and has a name, and "Unknown error" for everything else,
+ * including negative or too-large codes.
+ */
+inline char * get_error(int status)
+{
+        /* Never index the table with a value outside its bounds */
+        if (status < 0 || status >= GM_NUM_STATUS_CODES)
+                return "Unknown error";
+
+        if (gm_error_strings[status] != NULL)
+                return gm_error_strings[status];
+
+        return "Unknown error";
+}
+
+/*
+ * Callback handed to gm_resume_sending() and gm_drop_sends() by
+ * kgmnal_txhandler().  It only logs the context pointer and status at
+ * D_NET level; 'p' is not used.
+ */
+static void
+kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
+}
+
+/*
+ * Send-completion callback registered with gm_send_with_callback().
+ *
+ * 'context' is the kgmnal_tx_t of the finished send.  Every status other
+ * than GM_SUCCESS is logged and mapped to -EIO; some statuses additionally
+ * resume or drop the sends queued for the target.  The descriptor is then
+ * completed through kgmnal_tx_done().
+ */
+static void
+kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
+        int err = -EIO;         /* cleared below for GM_SUCCESS only */
+
+        LASSERT (p != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
+                ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
+
+        switch((int)status) {
+        case GM_SUCCESS:        /* normal */
+                err = 0;
+                break;
+
+        case GM_SEND_TIMED_OUT: /* application error */
+        case GM_SEND_REJECTED:  /* size of msg unacceptable */
+        case GM_SEND_TARGET_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
+                                  ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                                  kgmnal_errhandler, NULL);
+                break;
+
+        case GM_SEND_TARGET_NODE_UNREACHABLE:
+        case GM_SEND_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
+                              ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                              kgmnal_errhandler, NULL);
+                break;
+
+        case GM_SEND_DROPPED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                break;
+
+        default:
+                CERROR("Unknown status: %d\n", status);
+                break;
+        }
+
+        kgmnal_tx_done(ktx, err);
+}
+
+/*
+ */
+
+/*
+ * cb_send callback: transmit one portals message over GM.
+ *
+ * GM has no vectored send, so the header and the iov payload are copied
+ * into a single GM DMA buffer, which is handed to gm_send_with_callback().
+ * kgmnal_txhandler() runs on completion and releases both the buffer and
+ * the transmit descriptor.
+ *
+ * Returns 0 once the send has been queued, -ENOMEM if the descriptor or
+ * the DMA buffer cannot be allocated, and -1 if header + payload exceed
+ * MSG_LEN_LARGE.  On every failure path the descriptor is released again.
+ */
+static int
+kgmnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type,
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           int              options,
+           unsigned int     niov,
+           lib_md_iov_t    *iov,
+           size_t           len)
+{
+        /*
+         * ipnal assumes that this is the private as passed to lib_dispatch..
+         * so do we :/
+         */
+        kgmnal_tx_t *ktx=NULL;
+        int rc=0;
+        void * buf;
+        int buf_len = sizeof(ptl_hdr_t) + len;
+        int buf_size = 0;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        PROF_START(gmnal_send);
+
+        /* len is a size_t: print it as a long, as kgmnal_read/write do */
+        CDEBUG(D_NET, "sending %ld bytes from %p to nid: 0x%Lx pid %d\n",
+               (long)len, iov, nid, KGM_PORT_NUM);
+
+        /* ensure there is an available tx handle */
+
+        /* save transaction info to trans for later finalize and cleanup */
+        ktx = get_trans();
+        if (ktx == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+
+        /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce
+           header and data.
+           Also, memory must be dma'able or registered with GM. */
+
+        if (buf_len <= MSG_LEN_SMALL) {
+                buf_size = MSG_SIZE_SMALL;
+        } else if (buf_len <= MSG_LEN_LARGE) {
+                buf_size = MSG_SIZE_LARGE;
+        } else {
+                printk("kgmnal:request exceeds TX MTU size (%d).\n",
+                       MSG_SIZE_LARGE);
+                rc = -1;
+                goto send_free_ktx;
+        }
+
+        buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
+        if (buf == NULL) {
+                rc = -ENOMEM;
+                goto send_free_ktx;
+        }
+        memcpy(buf, hdr, sizeof(ptl_hdr_t));
+
+        if (len != 0)
+                lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
+                                 options, niov, iov, len);
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+        ktx->ktx_len = buf_len;
+        ktx->ktx_size = buf_size;
+        ktx->ktx_buffer = buf;
+        ktx->ktx_priority = GM_LOW_PRIORITY;
+        ktx->ktx_tgt_node = nid;
+        ktx->ktx_tgt_port_id = KGM_PORT_NUM;
+
+        CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
+               "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
+               GM_LOW_PRIORITY);
+
+        gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
+                              buf_len, GM_LOW_PRIORITY,
+                              nid, KGM_PORT_NUM,
+                              kgmnal_txhandler, ktx);
+
+        PROF_FINISH(gmnal_send);
+        return rc;
+
+ send_free_ktx:
+        /* the send was never queued, so no completion callback will free it */
+        put_trans(ktx);
+ send_exit:
+        return rc;
+}
+/*
+ * Forwarding entry point installed as kprni_fwd in the router interface.
+ * Forwarding is not implemented: neither argument is used and the call
+ * only logs an error.
+ */
+void
+kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+/*
+ * Forwarding callback stub: ignores both arguments and only logs an error.
+ * NOTE(review): the name carries the kqswnal prefix although this is the
+ * GM NAL -- presumably inherited from qswnal; check for users before
+ * renaming.
+ */
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/*
+ * Give the buffer described by 'krx' back to GM with
+ * gm_provide_receive_buffer(), using the size and priority recorded in krx.
+ */
+static inline void
+kgmnal_requeue_rx(kgmnal_rx_t *krx)
+{
+        gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
+                                  krx->krx_size, krx->krx_priority);
+}
+
+/*
+ * Process a received portals packet.
+ *
+ * Called from kgmnal_recv_thread() for each GM receive event.
+ *   len  - number of bytes received
+ *   size - GM size value of the receive buffer
+ *   buf  - receive buffer, starting with the portals header
+ *   pri  - GM priority the message arrived with
+ *
+ * A message shorter than a portals header is logged and its buffer is
+ * given straight back to GM (unless the NAL is shutting down).  A message
+ * addressed to this node is handed to lib_parse(); anything else is
+ * logged, dropped and its buffer requeued.
+ */
+static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
+                      void * buf, unsigned int pri)
+{
+        ptl_hdr_t  *hdr = buf;
+        kgmnal_rx_t krx;
+
+        CDEBUG(D_NET,"buf %p, len %lu\n", buf, len);
+
+        if ( len < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (kgm->kgm_shuttingdown)
+                        return;
+                CERROR("kgmnal: did not receive complete portal header, "
+                       "len= %lu\n", len);
+                gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
+                return;
+        }
+
+        /* might want to use separate threads to handle receive */
+        krx.krx_buffer = buf;
+        krx.krx_len = len;
+        krx.krx_size = size;
+        krx.krx_priority = pri;
+
+        if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
+                /* krx lives on this stack frame and is passed as the
+                 * 'private' argument that kgmnal_recv() reads back */
+                PROF_START(lib_parse);
+                lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
+                PROF_FINISH(lib_parse);
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx: target is "
+                       "a peer", hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented yet\n");
+                kgmnal_requeue_rx(&krx);
+        }
+
+        return;
+}
+
+
+/*
+ * cb_recv callback: deliver the payload of a received message.
+ *
+ * 'private' is the kgmnal_rx_t set up by kgmnal_rx().  The first 'mlen'
+ * payload bytes (those following the portals header) are copied into the
+ * iov, the message is finalized and the receive buffer is handed back to
+ * GM.  Returns rlen.
+ */
+static int kgmnal_recv(nal_cb_t     *nal,
+                      void         *private,
+                      lib_msg_t    *cookie,
+                      int           options,
+                      unsigned int  niov,
+                      lib_md_iov_t *iov,
+                      size_t        mlen,
+                      size_t        rlen)
+{
+        kgmnal_rx_t *krx = private;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        /* size_t values: print as long, as kgmnal_read/write do */
+        CDEBUG(D_NET,"mlen=%ld, rlen=%ld\n", (long)mlen, (long)rlen);
+
+        /* What was actually received must be >= what sender claims to
+         * have sent.  This is an LASSERT, since lib-move doesn't
+         * check cb return code yet. */
+        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+        LASSERT (mlen <= rlen);
+
+        PROF_START(gmnal_recv);
+
+        if(mlen != 0) {
+                PROF_START(memcpy);
+                lib_copy_buf2iov (options, niov, iov,
+                                  krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
+                PROF_FINISH(memcpy);
+        }
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        kgmnal_requeue_rx(krx);
+
+        PROF_FINISH(gmnal_recv);
+
+        return rlen;
+}
+
+
+/*
+ * Alarm callback installed by recv_shutdown().  It does nothing beyond
+ * logging that it was called; the argument is not used.
+ */
+static void kgmnal_shutdown(void * none)
+{
+        CERROR("called\n");
+        return;
+}
+
+/*
+ * Set terminate and use alarm to wake up the recv thread.
+ *
+ * The alarm object is handed to GM by gm_set_alarm() and is still
+ * referenced after this function returns, so it needs static storage
+ * rather than a slot on this function's stack.
+ * NOTE(review): with a single static alarm this must not be called again
+ * while an earlier alarm is still pending.
+ */
+static void  recv_shutdown(kgmnal_data_t *kgm)
+{
+        static gm_alarm_t alarm;
+
+        kgm->kgm_shuttingdown = 1;
+        gm_initialize_alarm(&alarm);
+        gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+}
+
+/*
+ * Start shutting the NAL down by calling recv_shutdown().  The other steps
+ * named in the comments below are not implemented here.  Always returns 0.
+ */
+int kgmnal_end(kgmnal_data_t *kgm)
+{
+
+        /* wait for sends to finish ? */
+        /* remove receive buffers */
+        /* shutdown receive thread */
+
+        recv_shutdown(kgm);
+
+        return 0;
+}
+
+/* Used only for the spinner */
+/*
+ * Receive thread body.  'arg' is the kgmnal_data_t of the NAL.
+ *
+ * Loops until kgm_shuttingdown is set or GM returns a NULL event.  Receive
+ * events are passed to kgmnal_rx() with the priority they arrived at;
+ * alarm and unrecognized events are passed to gm_unknown().
+ * Always returns 0.
+ */
+int kgmnal_recv_thread(void *arg)
+{
+        kgmnal_data_t *kgm = arg;
+
+        LASSERT(kgm != NULL);
+
+        kportal_daemonize("kgmnal_rx");
+
+        while (!kgm->kgm_shuttingdown) {
+                gm_recv_event_t *ev;
+                int pri = GM_LOW_PRIORITY;
+
+                ev = gm_blocking_receive_no_spin(kgm->kgm_port);
+                if (ev == NULL) {
+                        CERROR("gm_blocking_receive returned NULL\n");
+                        break;
+                }
+
+                switch(gm_ntohc(ev->recv.type)) {
+                case GM_HIGH_RECV_EVENT:
+                        pri = GM_HIGH_PRIORITY;
+                        /* fall through */
+                case GM_RECV_EVENT:
+                        kgmnal_rx(kgm, gm_ntohl(ev->recv.length),
+                                  gm_ntohc(ev->recv.size),
+                                  gm_ntohp(ev->recv.buffer), pri);
+                        break;
+
+                case GM_ALARM_EVENT:
+                        CERROR("received alarm");
+                        gm_unknown(kgm->kgm_port, ev);
+                        break;
+
+                case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
+                        CERROR("received bad send!\n");
+                        break;
+
+                default:
+                        gm_unknown(kgm->kgm_port, ev);
+                        break;
+                }
+        }
+
+        CERROR("shuttting down.\n");
+        return 0;
+}
+
+/* Library-side callback table for the GM NAL. */
+nal_cb_t kgmnal_lib = {
+        .nal_data  = &kgmnal_data,      /* NAL private data */
+        .cb_send   = kgmnal_send,
+        .cb_recv   = kgmnal_recv,
+        .cb_read   = kgmnal_read,
+        .cb_write  = kgmnal_write,
+        .cb_malloc = kgmnal_malloc,
+        .cb_free   = kgmnal_free,
+        .cb_printf = kgmnal_printf,
+        .cb_cli    = kgmnal_cli,
+        .cb_sti    = kgmnal_sti,
+        .cb_dist   = kgmnal_dist
+};
diff --git a/lnet/klnds/gmlnd/gmnal.c b/lnet/klnds/gmlnd/gmnal.c
new file mode 100644 (file)
index 0000000..ceeea2a
--- /dev/null
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "gmnal.h"
+
+/* Network interface handle; filled in by PtlNIInit() and exported below */
+ptl_handle_ni_t kgmnal_ni;
+/* API-side NAL descriptor; its method pointers are assigned at module init */
+nal_t kgmnal_api;
+
+/* Private state of the GM NAL instance */
+kgmnal_data_t kgmnal_data;
+/* Debug switch, off by default */
+int gmnal_debug = 0;
+
+/*
+ * Router interface description: NAL id, private argument and the packet
+ * forwarding callback.
+ * NOTE(review): the symbol carries a kqswnal_ prefix although this is the
+ * GM NAL -- looks like a copy/paste leftover.  Confirm who references it
+ * before renaming.
+ */
+kpr_nal_interface_t kqswnal_router_interface = {
+        .kprni_nalid = GMNAL,
+        .kprni_arg   = NULL,
+        .kprni_fwd   = kgmnal_fwd_packet,
+};
+
+/*
+ * API -> library forwarding hook.  Hands request 'id' with its argument
+ * and return blocks to lib_dispatch(); args_len and ret_len are not used.
+ * Always returns PTL_OK.
+ */
+static int kgmnal_forward(nal_t *nal, int id,
+                          void *args, size_t args_len,
+                          void *ret, size_t ret_len)
+{
+        kgmnal_data_t *kgm;
+        nal_cb_t      *cb;
+
+        kgm = nal->nal_data;
+        cb  = kgm->kgm_cb;
+
+        /* only the statically defined api/data/lib objects are expected */
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (kgm == &kgmnal_data);
+        LASSERT (cb == &kgmnal_lib);
+
+        /* nal needs the private data pointer */
+        lib_dispatch(cb, kgm, id, args, ret);
+
+        return (PTL_OK);
+}
+
+/*
+ * API lock hook: delegates to the library table's cb_cli callback, passing
+ * 'flags' through unchanged.
+ */
+static void kgmnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *kgm;
+        nal_cb_t      *cb;
+
+        kgm = nal->nal_data;
+        cb  = kgm->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (kgm == &kgmnal_data);
+        LASSERT (cb == &kgmnal_lib);
+
+        cb->cb_cli(cb, flags);
+}
+
+/*
+ * API unlock hook: delegates to the library table's cb_sti callback,
+ * passing 'flags' through unchanged.
+ */
+static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *kgm;
+        nal_cb_t      *cb;
+
+        kgm = nal->nal_data;
+        cb  = kgm->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (kgm == &kgmnal_data);
+        LASSERT (cb == &kgmnal_lib);
+
+        cb->cb_sti(cb, flags);
+}
+
+/*
+ * API shutdown hook.  Only sanity-checks 'nal'; 'ni' is unused.
+ * Always returns 0.
+ */
+static int
+kgmnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kgmnal_api);
+
+        return (0);
+}
+
+/* API yield hook: give up the CPU if the current task is due a reschedule. */
+static void kgmnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kgmnal_api);
+
+        if (!current->need_resched)
+                return;
+
+        schedule();
+}
+
+/*
+ * Allocate a receive descriptor and link it onto data->kgm_list.
+ *
+ * Returns the new descriptor, or NULL (after logging) if the allocation
+ * fails.  'ndx' is currently unused and no other descriptor field is set
+ * up here.
+ */
+kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
+{
+        kgmnal_rx_t *rx;
+
+        PORTAL_ALLOC(rx, sizeof(kgmnal_rx_t));
+
+        if (rx != NULL) {
+                list_add(&rx->krx_item,
+                         (struct list_head *)&data->kgm_list);
+                return rx;
+        }
+
+        /* out of memory */
+        printk("kgm_add_recv: memory alloc failed\n");
+        return NULL;
+}
+
+/*
+ * NAL start-up callback handed to PtlNIInit().  Queries the GM port for
+ * the highest node id in use, initialises the portals library with this
+ * node's nid and that count, and returns the API-side NAL descriptor.
+ * 'interface' and 'requested_pid' are unused.
+ */
+static nal_t *
+kgmnal_init(int interface, ptl_pt_index_t ptl_size,
+            ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        unsigned int node_count;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &node_count);
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
+               kgmnal_data.kgm_nid, node_count);
+
+        lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, node_count,
+                 ptl_size, ac_size);
+
+        return (&kgmnal_api);
+}
+
+/*
+ * Module exit: unregister the exported NI symbol, shut the portals
+ * interface and library down, close the GM port and release the transmit
+ * descriptors and every receive descriptor still on kgm_list.
+ *
+ * Known gaps (unchanged): the DMA receive buffers are not freed and the
+ * receiver thread is only flagged to stop, not waited for.
+ */
+static void __exit
+kgmnal_finalize(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
+        PtlNIFini(kgmnal_ni);
+        /* tear down the table that was handed to lib_init() */
+        lib_fini(&kgmnal_lib);
+
+        /* the receiver thread polls this flag at the top of its loop */
+        kgmnal_data.kgm_shuttingdown = 1;
+
+        if (kgmnal_data.kgm_port) {
+                gm_close(kgmnal_data.kgm_port);
+        }
+
+        /* FIXME: free dma buffers */
+        /* FIXME: wait for the receiver thread to exit */
+
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+
+        /* Pop entries off the head until the list is empty; iterating with
+         * list_for_each() while deleting skips entries and can leak one. */
+        while (!list_empty(&kgmnal_data.kgm_list)) {
+                kgmnal_rx_t *conn = list_entry(kgmnal_data.kgm_list.next,
+                                               kgmnal_rx_t, krx_item);
+
+                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
+                list_del(&conn->krx_item);
+                PORTAL_FREE(conn, sizeof(*conn));
+        }
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+}
+
+/*
+ * Allocate 'count' receive buffers of 'len' bytes with gm_dma_malloc() and
+ * hand each one to the GM port as a GM_LOW_PRIORITY buffer of GM size
+ * 'size'.  'what' only labels the debug output ("large" or "small").
+ *
+ * Returns 0 on success or -ENOMEM as soon as one allocation fails.
+ * Buffers already given to the port are not reclaimed here.
+ */
+static int __init
+kgmnal_post_rx_buffers(const char *what, int count, int len, int size)
+{
+        CDEBUG(D_NET, "gmnal_init: creating %d %s %d byte recv buffers\n",
+               count, what, len);
+
+        while (count-- > 0) {
+                void *buffer = gm_dma_malloc(kgmnal_data.kgm_port, len);
+
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc of %d byte recv buffer failed\n",
+                               len);
+                        return (-ENOMEM);
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       len, size, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          size, GM_LOW_PRIORITY);
+        }
+
+        return 0;
+}
+
+/*
+ * Module init for the GM NAL.
+ *
+ * Wires up the API method table, allocates the transmit descriptors,
+ * opens the GM port, posts large and small receive buffers (half of the
+ * receive tokens each), initialises the portals network interface and
+ * starts the receiver thread.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * acquired so far is released again, except DMA receive buffers already
+ * handed to the port (see the FIXME in kgmnal_finalize).
+ */
+static int __init
+kgmnal_initialize(void)
+{
+        int rc;
+        int ntok;
+        unsigned long sizemask;
+        unsigned int nid;
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kgmnal_api.forward = kgmnal_forward;
+        kgmnal_api.shutdown = kgmnal_shutdown;
+        kgmnal_api.yield = kgmnal_yield;
+        kgmnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kgmnal_api.lock= kgmnal_lock;
+        kgmnal_api.unlock= kgmnal_unlock;
+        kgmnal_api.nal_data = &kgmnal_data;
+
+        kgmnal_lib.nal_data = &kgmnal_data;
+
+        memset(&kgmnal_data, 0, sizeof(kgmnal_data));
+
+        INIT_LIST_HEAD(&kgmnal_data.kgm_list);
+        kgmnal_data.kgm_cb = &kgmnal_lib;
+
+        /* Allocate transmit descriptors */
+        PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        if (kgmnal_data.kgm_trans==NULL) {
+                printk("kgmnal: init: failed to allocate transmit "
+                       "descriptors\n");
+                return (-ENOMEM);
+        }
+        memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
+
+        spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
+        spin_lock_init(&kgmnal_data.kgm_update_lock);
+        spin_lock_init(&kgmnal_data.kgm_send_lock);
+
+        /* Do the receiver and xmtr allocation */
+
+        rc = gm_init();
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_init failed: %d\n", rc);
+                rc = -ENODEV;
+                goto out_free_trans;
+        }
+
+        rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
+                     GM_API_VERSION_1_1);
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_open failed: %d\n", rc);
+                kgmnal_data.kgm_port = NULL;
+                rc = -ENODEV;
+                goto out_gm_finalize;
+        }
+        gm_get_node_id(kgmnal_data.kgm_port, &nid);
+        kgmnal_data.kgm_nid = nid;
+
+        /* Allocate 2 different sizes of buffers. For now, use half
+           the tokens for each. */
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        rc = kgmnal_post_rx_buffers("large", ntok,
+                                    MSG_LEN_LARGE, MSG_SIZE_LARGE);
+        if (rc != 0)
+                goto out_close_port;
+
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        rc = kgmnal_post_rx_buffers("small", ntok,
+                                    MSG_LEN_SMALL, MSG_SIZE_SMALL);
+        if (rc != 0)
+                goto out_close_port;
+
+        /* unsigned long arithmetic to match the type of the mask */
+        sizemask = (1UL << MSG_SIZE_LARGE) | (1UL << MSG_SIZE_SMALL);
+        CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%lx\n",
+                        kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
+                                sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
+
+        /* Initialize Network Interface */
+        rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                rc = -ENOMEM;
+                goto out_close_port;
+        }
+
+        /* Start receiver thread */
+        rc = kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
+        if (rc < 0) {
+                CERROR("failed to start receiver thread: %d\n", rc);
+                goto out_ni_fini;
+        }
+
+        PORTAL_SYMBOL_REGISTER(kgmnal_ni);
+
+        kgmnal_data.kgm_init = 1;
+
+        return 0;
+
+        /* error unwinding, in reverse order of acquisition */
+ out_ni_fini:
+        PtlNIFini(kgmnal_ni);
+        lib_fini(&kgmnal_lib);
+ out_close_port:
+        gm_close(kgmnal_data.kgm_port);
+        kgmnal_data.kgm_port = NULL;
+ out_gm_finalize:
+        gm_finalize();
+ out_free_trans:
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        kgmnal_data.kgm_trans = NULL;
+        return rc;
+}
+
+MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
+MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
+MODULE_LICENSE("GPL");
+
+module_init (kgmnal_initialize);
+module_exit (kgmnal_finalize);
+
+EXPORT_SYMBOL (kgmnal_ni);
diff --git a/lnet/klnds/qswlnd/Makefile.am b/lnet/klnds/qswlnd/Makefile.am
new file mode 100644 (file)
index 0000000..6759b96
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kqswnal
+modulenet_DATA = kqswnal.o
+EXTRA_PROGRAMS = kqswnal
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h
diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c
new file mode 100644 (file)
index 0000000..d64b7ad
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+/* Network interface handle; filled in by PtlNIInit() and exported below */
+ptl_handle_ni_t kqswnal_ni;
+/* API-side NAL descriptor; its method pointers are assigned at module init */
+nal_t kqswnal_api;
+/* Private state of the QSW NAL instance */
+kqswnal_data_t kqswnal_data;
+
+/* Description of this NAL handed to kpr_register(): NAL id, private
+ * argument and the packet forwarding callback */
+kpr_nal_interface_t kqswnal_router_interface = {
+       .kprni_nalid = QSWNAL,
+       .kprni_arg   = NULL,
+       .kprni_fwd   = kqswnal_fwd_packet,
+};
+
+
+/*
+ * API -> library forwarding hook.  Hands request 'id' with its argument
+ * and return blocks to lib_dispatch(); args_len and ret_len are not used.
+ * Always returns PTL_OK.
+ */
+static int
+kqswnal_forward(nal_t *nal, int id,
+               void *args, size_t args_len,
+               void *ret, size_t ret_len)
+{
+       kqswnal_data_t *kqn;
+       nal_cb_t       *cb;
+
+       kqn = nal->nal_data;
+       cb  = kqn->kqn_cb;
+
+       /* only the statically defined api/data/lib objects are expected */
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (kqn == &kqswnal_data);
+       LASSERT (cb == &kqswnal_lib);
+
+       /* nal needs the private data pointer */
+       lib_dispatch(cb, kqn, id, args, ret);
+
+       return PTL_OK;
+}
+
+/*
+ * API lock hook: delegates to the library table's cb_cli callback, passing
+ * 'flags' through unchanged.
+ */
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *kqn;
+       nal_cb_t       *cb;
+
+       kqn = nal->nal_data;
+       cb  = kqn->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (kqn == &kqswnal_data);
+       LASSERT (cb == &kqswnal_lib);
+
+       cb->cb_cli(cb, flags);
+}
+
+/*
+ * API unlock hook: delegates to the library table's cb_sti callback,
+ * passing 'flags' through unchanged.
+ */
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *kqn;
+       nal_cb_t       *cb;
+
+       kqn = nal->nal_data;
+       cb  = kqn->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (kqn == &kqswnal_data);
+       LASSERT (cb == &kqswnal_lib);
+
+       cb->cb_sti(cb, flags);
+}
+
+/*
+ * API shutdown hook.  Logs the call and sanity-checks 'nal'; 'ni' is
+ * unused.  Always returns 0.
+ */
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (nal == &kqswnal_api);
+
+       return 0;
+}
+
+/* API yield hook: log the call, then give up the CPU if the current task
+ * is due a reschedule. */
+static void
+kqswnal_yield( nal_t *nal )
+{
+       CDEBUG (D_NET, "yield\n");
+
+       if (!current->need_resched)
+               return;
+
+       schedule();
+}
+
+/*
+ * NAL start-up callback handed to PtlNIInit().  Reads this node's id and
+ * the node count from the Elan device, initialises the portals library
+ * with them and returns the API-side NAL descriptor.
+ * 'interface' and 'requested_pid' are unused.
+ */
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+            ptl_pid_t requested_pid)
+{
+       ptl_nid_t nid;
+       int       nodes;
+
+       nid   = ep_nodeid (kqswnal_data.kqn_epdev);
+       nodes = ep_numnodes (kqswnal_data.kqn_epdev);
+
+       CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", nid, nodes);
+
+       lib_init(&kqswnal_lib, nid, 0, nodes, ptl_size, ac_size);
+
+       return &kqswnal_api;
+}
+
+/*
+ * Tear the QSW NAL down from whatever state kqn_init records.
+ *
+ * Used both as the module exit handler and by kqswnal_initialise() to
+ * unwind after a failure.  Because it is called from __init code it must
+ * not be marked __exit: exit-section text may be discarded when the code
+ * is built in.
+ *
+ * The switch undoes the portals-level steps that completed, falling
+ * through from the most to the least initialised state.  The remainder
+ * stops the router and threads, closes the Elan comms, fails any queued
+ * forwarding descriptors, and releases DMA mappings, descriptors and
+ * buffers.  Finally kqswnal_data is zeroed, which also resets kqn_init.
+ */
+void
+kqswnal_finalise (void)
+{
+       switch (kqswnal_data.kqn_init)
+       {
+       default:
+               LASSERT (0);
+
+       case KQN_INIT_ALL:
+               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+               /* fall through */
+
+       case KQN_INIT_PTL:
+               PtlNIFini (kqswnal_ni);
+               lib_fini (&kqswnal_lib);
+               /* fall through */
+
+       case KQN_INIT_DATA:
+               break;
+
+       case KQN_INIT_NOTHING:
+               return;
+       }
+
+       /**********************************************************************/
+       /* Make the router stop calling me and fail any more call-ins */
+       kpr_shutdown (&kqswnal_data.kqn_router);
+
+       /**********************************************************************/
+       /* flag threads to terminate, wake them and wait for them to die */
+
+       kqswnal_data.kqn_shuttingdown = 1;
+       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+       /* poll once a second until every thread has gone */
+       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                      atomic_read (&kqswnal_data.kqn_nthreads));
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+
+       /**********************************************************************/
+       /* close elan comms */
+
+       if (kqswnal_data.kqn_eprx_small != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+       if (kqswnal_data.kqn_eprx_large != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+       if (kqswnal_data.kqn_eptx != NULL)
+               ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+       /**********************************************************************/
+       /* No more threads.  No more portals, router or comms callbacks!
+        * I control the horizontals and the verticals...
+        */
+
+       /**********************************************************************/
+       /* Complete any blocked forwarding packets with error
+        */
+
+       while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       /**********************************************************************/
+       /* Wait for router to complete any packets I sent her
+        */
+
+       kpr_deregister (&kqswnal_data.kqn_router);
+
+
+       /**********************************************************************/
+       /* Unmap message buffers and free all descriptors and buffers
+        */
+
+       /* page counts below mirror the reservations made at init time */
+       if (kqswnal_data.kqn_eprxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle, 0,
+                                 KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                 KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_eptxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle, 0,
+                                 KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_txds != NULL)
+       {
+               int   i;
+
+               /* descriptors were zeroed at allocation, so a NULL buffer
+                * means init never got as far as allocating it */
+               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+               {
+                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                       if (ktx->ktx_buffer != NULL)
+                               PORTAL_FREE(ktx->ktx_buffer,
+                                           KQSW_TX_BUFFER_SIZE);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_txds,
+                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                    KQSW_NNBLK_TXMSGS));
+       }
+
+       if (kqswnal_data.kqn_rxds != NULL)
+       {
+               int   i;
+               int   j;
+
+               /* krx_npages is 0 for descriptors init never reached */
+               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+               {
+                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                       for (j = 0; j < krx->krx_npages; j++)
+                               if (krx->krx_pages[j] != NULL)
+                                       __free_page (krx->krx_pages[j]);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_rxds,
+                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                   KQSW_NRXMSGS_LARGE));
+       }
+
+       /* resets flags, pointers to NULL etc */
+       memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+       printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+/*
+ * Module init for the QSW NAL.
+ *
+ * Steps, in order: wire up the API method table and zero/initialise the
+ * private state; grab Elan device 0, a transmitter and the small/large
+ * receivers; reserve Elan DMA space for tx and rx; allocate and map the
+ * transmit and receive descriptors; initialise the portals interface;
+ * queue all receives; spawn one scheduler thread per CPU; register with
+ * the router and export the NI symbol.
+ *
+ * kqn_init records how far initialisation got, so that every failure path
+ * can unwind through kqswnal_finalise().
+ *
+ * Returns 0 on success or a negative errno.  Failure to register with the
+ * router is not fatal: the NAL loads with routing disabled.
+ */
+static int __init
+kqswnal_initialise (void)
+{
+       ELAN3_DMA_REQUEST dmareq;
+       int               rc;
+       int               i;
+       int               elan_page_idx;
+       int               pkmem = atomic_read(&portal_kmemory);
+
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+       kqswnal_api.forward  = kqswnal_forward;
+       kqswnal_api.shutdown = kqswnal_shutdown;
+       kqswnal_api.yield    = kqswnal_yield;
+       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
+       kqswnal_api.lock     = kqswnal_lock;
+       kqswnal_api.unlock   = kqswnal_unlock;
+       kqswnal_api.nal_data = &kqswnal_data;
+
+       kqswnal_lib.nal_data = &kqswnal_data;
+
+       /* ensure all pointers NULL etc */
+       memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       kqswnal_data.kqn_cb = &kqswnal_lib;
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+       spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+       spin_lock_init (&kqswnal_data.kqn_sched_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+       spin_lock_init (&kqswnal_data.kqn_statelock);
+
+       /* pointers/lists/locks initialised */
+       kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+       /**********************************************************************/
+       /* Find the first Elan device */
+
+       kqswnal_data.kqn_epdev = ep_device (0);
+       if (kqswnal_data.kqn_epdev == NULL)
+       {
+               CERROR ("Can't get elan device 0\n");
+               /* unwind like every other failure path below */
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the transmitter */
+
+       kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+       if (kqswnal_data.kqn_eptx == NULL)
+       {
+               CERROR ("Can't allocate transmitter\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the receivers */
+
+       kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_SMALL,
+                                                            KQSW_EP_ENVELOPES_SMALL);
+       if (kqswnal_data.kqn_eprx_small == NULL)
+       {
+               CERROR ("Can't install small msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_LARGE,
+                                                            KQSW_EP_ENVELOPES_LARGE);
+       if (kqswnal_data.kqn_eprx_large == NULL)
+       {
+               CERROR ("Can't install large msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+       rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                             KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                             &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve tx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+       rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise transmit descriptors */
+
+       PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       if (kqswnal_data.kqn_txds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /* clear flags, null pointers etc */
+       memset(kqswnal_data.kqn_txds, 0,
+              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+       {
+               int           premapped_pages;
+               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+               int           basepage = i * KQSW_NTXMSGPAGES;
+
+               PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+               if (ktx->ktx_buffer == NULL)
+               {
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+
+               /* Map pre-allocated buffer NOW, to save latency on transmit */
+               premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                       KQSW_TX_BUFFER_SIZE);
+
+               elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                      kqswnal_data.kqn_eptxdmahandle,
+                                      ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                      basepage, &ktx->ktx_ebuffer);
+
+               ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+               ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+               /* the first KQSW_NTXMSGS descriptors go on the normal idle
+                * list, the rest on the non-blocking one */
+               if (i < KQSW_NTXMSGS)
+                       ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+               else
+                       ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+               list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise receive descriptors */
+
+       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+       if (kqswnal_data.kqn_rxds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+       elan_page_idx = 0;
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               E3_Addr       elanaddr;
+               int           j;
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               /* small-message descriptors come first, large ones after */
+               if (i < KQSW_NRXMSGS_SMALL)
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+               }
+               else
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+               }
+
+               LASSERT (krx->krx_npages > 0);
+               for (j = 0; j < krx->krx_npages; j++)
+               {
+                       krx->krx_pages[j] = alloc_page (GFP_KERNEL);
+                       if (krx->krx_pages[j] == NULL)
+                       {
+                               kqswnal_finalise ();
+                               return (-ENOMEM);
+                       }
+
+                       LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                       elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                             kqswnal_data.kqn_eprxdmahandle,
+                                             page_address(krx->krx_pages[j]),
+                                             PAGE_SIZE, elan_page_idx,
+                                             &elanaddr);
+                       elan_page_idx++;
+
+                       if (j == 0)
+                               krx->krx_elanaddr = elanaddr;
+
+                       /* NB we assume the pages of one descriptor map to
+                        * contiguous Elan addresses */
+                       LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+               }
+       }
+       LASSERT (elan_page_idx ==
+                (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+       /**********************************************************************/
+       /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+       {
+               CERROR ("PtlNIInit failed %d\n", rc);
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+       /**********************************************************************/
+       /* Queue receives, now that it's OK to run their completion callbacks */
+
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               /* NB this enqueue can allocate/sleep (attr == 0) */
+               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                     krx->krx_elanaddr,
+                                     krx->krx_npages * PAGE_SIZE, 0);
+               if (rc != 0)
+               {
+                       CERROR ("failed ep_queue_receive %d\n", rc);
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+       }
+
+       /**********************************************************************/
+       /* Spawn scheduling threads */
+       for (i = 0; i < smp_num_cpus; i++)
+       {
+               rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+               if (rc != 0)
+               {
+                       CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                       kqswnal_finalise ();
+                       return (rc);
+               }
+       }
+
+       /**********************************************************************/
+       /* Connect to the router; failure is not fatal, we just don't route */
+       rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+       if (rc != 0)
+               CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
+
+       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+       kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+       printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+              "(Routing %s, initial mem %d)\n", 
+              ep_nodeid (kqswnal_data.kqn_epdev),
+              ep_numnodes (kqswnal_data.kqn_epdev),
+              kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+              pkmem);
+
+       return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);
+module_exit (kqswnal_finalise);
+
+EXPORT_SYMBOL (kqswnal_ni);
diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h
new file mode 100644 (file)
index 0000000..657b02b
--- /dev/null
@@ -0,0 +1,249 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Common definitions for the Quadrics Elan (QSW) kernel NAL.
+ *
+ */
+
+#ifndef _QSWNAL_H
+#define _QSWNAL_H
+#define EXPORT_SYMTAB
+
+#ifdef PROPRIETARY_ELAN
+# include <qsw/kernel.h>
+#else
+# include <qsnet/kernel.h>
+#endif
+
+#undef printf                                   /* nasty QSW #define */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <elan3/elanregs.h>
+#include <elan3/elandev.h>
+#include <elan3/elanvp.h>
+#include <elan3/elan3mmu.h>
+#include <elan3/elanctxt.h>
+#include <elan3/elandebug.h>
+#include <elan3/urom_addrs.h>
+#include <elan3/busops.h>
+#include <elan3/kcomm.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_QSWNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define KQSW_CHECKSUM  0                       /* set non-zero to compile in message checksumming */
+#if KQSW_CHECKSUM
+typedef unsigned long kqsw_csum_t;
+#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t))      /* room for two checksums: header and payload */
+#else
+#define KQSW_CSUM_SIZE 0                       /* no checksum bytes on the wire */
+#endif
+#define KQSW_HDR_SIZE  (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)   /* portals header plus any checksums */
+
+/*
+ *  Elan NAL
+ */
+#define EP_SVC_LARGE_PORTALS_SMALL     (0x10)  /* Portals over elan port number (small payloads) */
+#define EP_SVC_LARGE_PORTALS_LARGE     (0x11)  /* Portals over elan port number (large payloads) */
+/* NB small/large message sizes are GLOBAL constants */
+
+/*
+ * Performance Tuning defines
+ * NB no mention of PAGE_SIZE for interoperability
+ */
+#if PTL_LARGE_MTU
+# define KQSW_MAXPAYLOAD               (256<<10) /* biggest message this NAL will cope with (256KB) */
+#else
+# define KQSW_MAXPAYLOAD               (64<<10) /* biggest message this NAL will cope with (64KB) */
+#endif
+
+#define KQSW_SMALLPAYLOAD              ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint: hdr + small payload == 4KB */
+
+#define KQSW_TX_MAXCONTIG              (1<<10) /* largest payload that gets made contiguous on transmit */
+
+#define KQSW_NTXMSGS                   8       /* # normal transmit messages */
+#define KQSW_NNBLK_TXMSGS              128     /* # reserved transmit messages for senders that can't block */
+
+#define KQSW_NRXMSGS_LARGE             64      /* # large receive buffers */
+#define KQSW_EP_ENVELOPES_LARGE        128     /* # large ep envelopes */
+
+#define KQSW_NRXMSGS_SMALL             256     /* # small receive buffers */
+#define KQSW_EP_ENVELOPES_SMALL                2048    /* # small ep envelopes */
+
+#define KQSW_RESCHED                   100     /* # busy loops that force the scheduler to yield */
+
+/*
+ * derived constants
+ */
+
+#define KQSW_TX_BUFFER_SIZE    (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+/* The pre-allocated tx buffer (hdr + small payload) */
+
+#define KQSW_NTXMSGPAGES       (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+/* Reserve elan address space for pre-allocated and pre-mapped transmit
+ * buffer and a full payload too.  Extra pages allow for page alignment */
+
+#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
+/* size in bytes of one small receive buffer */
+
+#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
+/* biggest complete packet we can receive (or transmit) */
+
+
+/*
+ * Receive descriptor: one receive buffer together with the Elan state
+ * needed to post it (see ep_queue_receive()) and re-post it (see
+ * kqswnal_requeue_rx()).  The page and iovec arrays are sized for the
+ * large-message case; krx_npages says how many entries are in use.
+ */
+typedef struct 
+{
+        struct list_head krx_list;              /* enqueue -> thread */
+        EP_RCVR                *krx_eprx;              /* port to post receives to */
+        EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
+        E3_Addr          krx_elanaddr;          /* Elan address of buffer (contiguous in elan vm) */
+        int              krx_npages;            /* # pages in receive buffer */
+        int              krx_nob;               /* Number Of Bytes received into buffer */
+        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
+        struct page     *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */
+        struct iovec     krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */
+}  kqswnal_rx_t;
+
+/*
+ * Transmit descriptor.  Each one owns a pre-mapped contiguous buffer
+ * (ktx_buffer / ktx_ebuffer) for the header and small payloads, plus a
+ * window of reserved Elan tx address space (ktx_basepage, ktx_npages) into
+ * which larger payloads are mapped for the duration of one message.
+ */
+typedef struct
+{
+        struct list_head  ktx_list;             /* enqueue idle/delayed */
+        struct list_head *ktx_idle;             /* idle list to return to (kqn_idletxds or kqn_nblk_idletxds) */
+        char              ktx_state;            /* What I'm doing (one of KTX_* below) */
+        uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
+        int               ktx_npages;           /* pages reserved for mapping messages */
+        int               ktx_nmappedpages;     /* # pages mapped for current message (0 when idle) */
+        EP_IOVEC         ktx_iov[EP_MAXFRAG];  /* msg frags (elan vaddrs) */
+        int               ktx_niov;             /* # message frags */
+        int               ktx_port;             /* destination ep port */
+        ptl_nid_t         ktx_nid;              /* destination node */
+        void             *ktx_args[2];          /* completion passthru: lib_finalize() args when sending,
+                                                 * [0] is the kpr_fwd_desc_t when forwarding */
+        E3_Addr                  ktx_ebuffer;          /* elan address of ktx_buffer */
+        char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
+} kqswnal_tx_t;
+
+#define KTX_IDLE       0                       /* MUST BE ZERO (so zeroed ktx is idle) */
+#define KTX_SENDING    1                       /* local send: completes via lib_finalize() */
+#define KTX_FORWARDING 2                       /* routing a packet: completes via kpr_fwd_done() */
+
+/* Global state of the NAL; the single instance is kqswnal_data. */
+typedef struct
+{
+        char               kqn_init;            /* what's been initialised (KQN_INIT_* below) */
+        char               kqn_shuttingdown;    /* I'm trying to shut down */
+        atomic_t           kqn_nthreads;        /* # threads still running */
+
+        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+
+        struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
+        struct list_head   kqn_nblk_idletxds;   /* reserve of idle txds for local senders that may not block */
+        spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
+        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
+        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+        
+        spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
+        wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
+
+        struct list_head   kqn_readyrxds;       /* rxds full of data */
+        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_delayedtxds;     /* delayed transmits */
+
+        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
+        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
+       EP_DEV            *kqn_epdev;           /* elan device */
+       EP_XMTR           *kqn_eptx;            /* elan transmitter */
+       EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
+        EP_RCVR                  *kqn_eprx_large;      /* elan receiver (large messages) */
+       ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
+       ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
+        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
+}  kqswnal_data_t;
+
+/* kqn_init state */
+#define KQN_INIT_NOTHING       0               /* MUST BE ZERO so zeroed state is initialised OK */
+#define KQN_INIT_DATA          1
+#define KQN_INIT_PTL           2
+#define KQN_INIT_ALL           3               /* set once kqswnal_initialise() has completed */
+
+extern nal_cb_t        kqswnal_lib;            /* library-side NAL handle (passed to lib_finalize()) */
+extern nal_t           kqswnal_api;
+extern kqswnal_data_t  kqswnal_data;           /* the one global NAL state instance */
+
+extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);     /* used at init to spawn the scheduler threads */
+extern void kqswnal_rxhandler(EP_RXD *rxd);    /* receive callback given to ep_queue_receive()/ep_requeue_receive() */
+extern int kqswnal_scheduler (void *);         /* thread body started via kqswnal_thread_start() */
+extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+/*
+ * Re-post a receive descriptor: requeue krx->krx_rxd with the same
+ * callback (kqswnal_rxhandler), the same Elan buffer address and the full
+ * buffer size (krx_npages pages).
+ */
+static inline void
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
+{
+        ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx,
+                            krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE);
+}
+
+/*
+ * Return how many pages are touched by the 'nob' bytes starting at 'base':
+ * the page holding the first byte through the page holding the last byte,
+ * inclusive.  Asserts that the range does not wrap the address space.
+ */
+static inline int
+kqswnal_pages_spanned (void *base, int nob)
+{
+        unsigned long start      = (unsigned long)base;
+        unsigned long last_byte  = start + (nob - 1);
+        unsigned long first_page = start >> PAGE_SHIFT;
+        unsigned long last_page  = last_byte >> PAGE_SHIFT;
+
+        LASSERT (last_page >= first_page);      /* can't wrap address space */
+
+        return (1 + last_page - first_page);
+}
+
+#if KQSW_CHECKSUM
+/*
+ * Fold 'nob' bytes starting at 'base' into the running checksum 'sum' by
+ * simple byte-wise addition and return the new total.  A non-positive
+ * 'nob' returns 'sum' unchanged.
+ */
+static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
+{
+        unsigned char *bytes = (unsigned char *)base;
+        int            i;
+
+        for (i = 0; i < nob; i++)
+                sum += bytes[i];
+
+        return (sum);
+}
+#endif
+
+#endif /* _QSWNAL_H */
diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c
new file mode 100644 (file)
index 0000000..5979885
--- /dev/null
@@ -0,0 +1,1242 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+atomic_t kqswnal_packets_launched;      /* bumped by kqswnal_launch() when ep_transmit_large() returns 0 */
+atomic_t kqswnal_packets_transmitted;   /* bumped by kqswnal_txhandler() on EP_SUCCESS */
+atomic_t kqswnal_packets_received;      /* presumably the receive-side counterpart; updated outside the tx path */
+
+
+/*
+ *  LIB functions follow
+ *
+ */
+/*
+ * Copy 'len' bytes from src_addr to dst_addr on behalf of the library.
+ * Both addresses are handed straight to memcpy(); 'private' is unused.
+ * Always returns 0.
+ */
+static int
+kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+/*
+ * Copy 'len' bytes from src_addr to dst_addr on behalf of the library
+ * (the mirror image of kqswnal_read()).  Both addresses are handed
+ * straight to memcpy(); 'private' is unused.  Always returns 0.
+ */
+static int
+kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+              size_t len)
+{
+        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+/*
+ * Allocate 'len' bytes with PORTAL_ALLOC() and return the result.
+ * 'nal' is unused.  Pair with kqswnal_free(), passing the same length.
+ */
+static void *
+kqswnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);         /* the macro stores the result in 'buf' */
+        return (buf);
+}
+
+/*
+ * Release a buffer with PORTAL_FREE(); 'len' is passed through to the
+ * macro, so callers supply the buffer's size.  'nal' is unused.
+ */
+static void
+kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/*
+ * printf-style logging hook: format the arguments into a fixed 256-byte
+ * buffer (longer output is truncated) and emit the text at D_NET debug
+ * level.  'nal' is unused.
+ */
+static void
+kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        va_start (args, fmt);
+        vsnprintf (text, sizeof (text), fmt, args);     /* bounded format */
+        va_end (args);
+
+        /* belt and braces: force termination whatever vsnprintf did */
+        text[sizeof (text) - 1] = 0;
+
+        CDEBUG (D_NET, "%s", text);
+}
+
+
+/*
+ * Take the NAL state lock (kqn_statelock) with interrupts disabled,
+ * saving the previous interrupt state in *flags.  Undone by kqswnal_sti().
+ */
+static void
+kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kqn_statelock, *flags);
+}
+
+
+/*
+ * Drop the NAL state lock taken by kqswnal_cli() and restore the
+ * interrupt state saved in *flags.
+ */
+static void
+kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kqn_statelock, *flags);
+}
+
+
+/*
+ * Report the "distance" to 'nid' in *dist: 0 for this node itself, 1 for
+ * anything else (network distance doesn't mean much for this nal).
+ * Always returns 0.
+ */
+static int
+kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        if (nid == nal->ni.nid)
+                *dist = 0;              /* talking to ourselves */
+        else
+                *dist = 1;              /* every other node is one hop away */
+
+        return (0);
+}
+
+/*
+ * Return non-zero iff 'nid' names a node we can reach directly: it must
+ * survive narrowing to an unsigned int unchanged and be below the number
+ * of nodes reported by ep_numnodes() for our Elan device.
+ */
+int
+kqswnal_ispeer (ptl_nid_t nid)
+{
+        unsigned int node = (unsigned int)nid;
+
+        /* lost high bits on conversion => can't be an elan id */
+        if ((ptl_nid_t)node != nid)
+                return (0);
+
+        /* is it in this machine? */
+        return (node < ep_numnodes (kqswnal_data.kqn_epdev));
+}
+
+/*
+ * Release the temporary Elan mappings made for the current message of
+ * 'ktx' (those counted by ktx_nmappedpages) and reset the count to zero.
+ * Does nothing when no pages are mapped.
+ */
+void
+kqswnal_unmap_tx (kqswnal_tx_t *ktx)
+{
+        if (ktx->ktx_nmappedpages != 0) {
+                CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
+                        ktx, ktx->ktx_niov, ktx->ktx_basepage,
+                        ktx->ktx_nmappedpages);
+
+                /* the mapped run must lie inside this descriptor's window
+                 * and inside the reserved tx address space */
+                LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
+                LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
+                         kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
+
+                elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eptxdmahandle,
+                                  ktx->ktx_basepage,
+                                  ktx->ktx_nmappedpages);
+
+                ktx->ktx_nmappedpages = 0;
+        }
+}
+
+/*
+ * Map a page-based payload (kiov[0..niov-1], 'nob' bytes in total) into
+ * the Elan tx address window reserved for 'ktx', appending to the frags
+ * already described by ktx_iov[0..ktx_niov-1].
+ *
+ * Each kiov entry consumes one page of the window.  A frag whose Elan
+ * address directly follows the previous frag is merged into it rather
+ * than taking a new ktx_iov slot.
+ *
+ * Returns 0 on success (ktx_niov and ktx_nmappedpages updated) or
+ * -EMSGSIZE if the message needs more pages or frags than are available.
+ * On failure ktx_nmappedpages still counts the pages loaded so far, so
+ * kqswnal_unmap_tx() can release them; ktx_niov is left untouched.
+ */
+int
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;     /* next free page in the window */
+        char     *ptr;
+        
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        do {
+                int  fraglen = kiov->kiov_len;
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                /* each frag fits in a page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                nmapped++;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                /* checked before we know whether this frag would merge with
+                 * the previous one, so a full frag table always fails here */
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                /* XXX this is really crap, but we'll have to kmap until
+                 * EKC has a page (rather than vaddr) mapping interface */
+
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, page %d, %d total\n",
+                        ktx, nfrags, ptr, fraglen, basepage, nmapped);
+
+                /* the Elan address of the frag is returned in ktx_iov[nfrags].Base */
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ptr, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+
+                kunmap (kiov->kiov_page);
+                
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage++;
+                kiov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/*
+ * Map a virtual-address payload (iov[0..niov-1], 'nob' bytes in total)
+ * into the Elan tx address window reserved for 'ktx', appending to the
+ * frags already described by ktx_iov[0..ktx_niov-1].
+ *
+ * Each iov entry consumes as many window pages as it spans (see
+ * kqswnal_pages_spanned()).  A frag whose Elan address directly follows
+ * the previous frag is merged into it rather than taking a new slot.
+ *
+ * Returns 0 on success (ktx_niov and ktx_nmappedpages updated) or
+ * -EMSGSIZE if the message needs more pages or frags than are available.
+ * On failure ktx_nmappedpages still counts the pages loaded so far, so
+ * kqswnal_unmap_tx() can release them; ktx_niov is left untouched.
+ */
+int
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;     /* next free page in the window */
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        do {
+                int  fraglen = iov->iov_len;
+                long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                
+                nmapped += npages;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                /* checked before we know whether this frag would merge with
+                 * the previous one, so a full frag table always fails here */
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
+                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
+                        nmapped);
+
+                /* the Elan address of the frag is returned in ktx_iov[nfrags].Base */
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       iov->iov_base, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage += npages;
+                iov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/*
+ * Return a transmit descriptor to the idle list it came from (ktx_idle)
+ * after dropping its temporary mappings and marking it KTX_IDLE.
+ *
+ * When the descriptor belongs to the normal pool, anything waiting for
+ * one is kicked: the first forwarded packet parked on kqn_idletxd_fwdq
+ * is moved to kqn_delayedfwds for the scheduler to retry, and any local
+ * sender sleeping on kqn_idletxd_waitq is woken.  Descriptors from the
+ * non-blocking reserve wake nobody.
+ */
+void
+kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
+{
+        kpr_fwd_desc_t   *fwd = NULL;
+        struct list_head *idle = ktx->ktx_idle;
+        unsigned long     flags;
+
+        kqswnal_unmap_tx (ktx);                /* release temporary mappings */
+        ktx->ktx_state = KTX_IDLE;
+
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        list_add (&ktx->ktx_list, idle);
+
+        /* reserved for non-blocking tx */
+        if (idle == &kqswnal_data.kqn_nblk_idletxds) {
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+                return;
+        }
+
+        /* anything blocking for a tx descriptor? */
+        if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
+        {
+                CDEBUG(D_NET,"wakeup fwd\n");
+
+                /* take it off the blocked queue now; it is rescheduled
+                 * below, after the idle-txd lock has been dropped */
+                fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+        }
+
+        if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq))  /* process? */
+        {
+                /* local sender waiting for tx desc */
+                CDEBUG(D_NET,"wakeup process\n");
+                wake_up (&kqswnal_data.kqn_idletxd_waitq);
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        if (fwd == NULL)
+                return;
+
+        /* schedule packet for forwarding again */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+/*
+ * Get an idle transmit descriptor.
+ *
+ *   fwd       - non-NULL when the caller is forwarding a packet; if the
+ *               normal pool is empty, 'fwd' is parked on
+ *               kqn_idletxd_fwdq and NULL is returned
+ *               (kqswnal_put_idle_tx() reschedules it later).
+ *   may_block - for local sends (fwd == NULL): non-zero lets the caller
+ *               sleep until a normal descriptor is free; zero makes it
+ *               take one from the non-blocking reserve instead.
+ *
+ * Returns the descriptor, or NULL if the forward was queued or the
+ * non-blocking reserve is exhausted.
+ */
+kqswnal_tx_t *
+kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+{
+        unsigned long  flags;
+        kqswnal_tx_t  *ktx = NULL;
+
+        /* every 'break' below leaves the loop with the idle-txd lock held;
+         * it is dropped once, after the loop */
+        for (;;) {
+                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
+                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* "normal" descriptor pool is empty */
+
+                if (fwd != NULL) { /* forwarded packet => queue for idle txd */
+                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                        list_add_tail (&fwd->kprfd_list,
+                                       &kqswnal_data.kqn_idletxd_fwdq);
+                        break;
+                }
+
+                /* doing a local transmit */
+                if (!may_block) {
+                        if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
+                                CERROR ("intr tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                CDEBUG (D_NET, "blocking for tx desc\n");
+                /* the list is re-checked under the lock on the next pass,
+                 * so losing the race to another sender just sleeps again */
+                wait_event (kqswnal_data.kqn_idletxd_waitq,
+                            !list_empty (&kqswnal_data.kqn_idletxds));
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
+        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
+        return (ktx);
+}
+
+/*
+ * Complete a transmit: notify whoever asked for it, then recycle the
+ * descriptor.  Forwarded packets are completed through the router with
+ * 'error'; locally sourced packets are completed with lib_finalize(),
+ * which is not passed 'error'.  Any other state trips an assertion.
+ */
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+{
+        char state = ktx->ktx_state;
+
+        if (state == KTX_FORWARDING) {
+                /* router asked me to forward this packet */
+                kpr_fwd_desc_t *fwd = (kpr_fwd_desc_t *)ktx->ktx_args[0];
+
+                kpr_fwd_done (&kqswnal_data.kqn_router, fwd, error);
+        } else if (state == KTX_SENDING) {
+                /* packet sourced locally */
+                lib_msg_t *msg = (lib_msg_t *)ktx->ktx_args[1];
+
+                lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg);
+        } else {
+                LASSERT (0);
+        }
+
+        kqswnal_put_idle_tx (ktx);
+}
+
+/*
+ * Transmit completion callback handed to ep_transmit_large().  'arg' is
+ * the kqswnal_tx_t that was launched.  Success bumps the transmitted
+ * counter; any other status is logged and reported onwards as -EIO.
+ */
+static void
+kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
+{
+        kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg;
+
+        LASSERT (txd != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
+
+        if (status == EP_SUCCESS) {
+                atomic_inc (&kqswnal_packets_transmitted);
+        } else {
+                CERROR ("kqswnal: Transmit failed with %d\n", status);
+                status = -EIO;
+        }
+
+        kqswnal_tx_done (ktx, status);
+}
+
+/*
+ * Hand a fully set-up transmit descriptor to the Elan transmitter.
+ *
+ * The destination (ktx_nid, ktx_port) and frag list (ktx_iov, ktx_niov)
+ * must already be filled in; kqswnal_txhandler() is called on completion.
+ *
+ * Returns the ep_transmit_large() result, except that ENOMEM is absorbed:
+ * the descriptor is queued on kqn_delayedtxds, the scheduler is woken to
+ * retry it, and 0 is returned.
+ *
+ * NOTE(review): the comparison is against positive ENOMEM, so
+ * ep_transmit_large() presumably returns positive error numbers - confirm
+ * against the EKC headers.
+ */
+int
+kqswnal_launch (kqswnal_tx_t *ktx)
+{
+        /* Don't block for transmit descriptor if we're in interrupt context */
+        int   attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
+        int   rc   = ep_transmit_large(kqswnal_data.kqn_eptx, ktx->ktx_nid,
+                                       ktx->ktx_port, attr, kqswnal_txhandler,
+                                       ktx, ktx->ktx_iov, ktx->ktx_niov);
+        unsigned long flags;            /* irqsave state: must be unsigned long */
+
+        if (rc == 0)
+                atomic_inc (&kqswnal_packets_launched);
+
+        if (rc != ENOMEM)
+                return (rc);
+
+        /* can't allocate ep txd => queue for later */
+
+        LASSERT (in_interrupt());      /* not called by thread (not looping) */
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        return (0);
+}
+
+
+/*
+ * Map the message type stored in a portals header to a printable name;
+ * unrecognised values give "<UNKNOWN>".
+ *
+ * NOTE(review): this switches on hdr->type as stored, whereas
+ * kqswnal_cerror_hdr() switches on NTOH__u32(hdr->type) - confirm which
+ * byte order callers hold the header in.
+ */
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        char *name;
+
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                name = "ACK";
+                break;
+        case PTL_MSG_PUT:
+                name = "PUT";
+                break;
+        case PTL_MSG_GET:
+                name = "GET";
+                break;
+        case PTL_MSG_REPLY:
+                name = "REPLY";
+                break;
+        default:
+                name = "<UNKNOWN>";
+                break;
+        }
+
+        return (name);
+}
+
+/*
+ * Dump a portals header to the error log: its type, source and
+ * destination nid/pid, then the fields specific to the message type.
+ * Headers of unrecognised type get only the common lines.
+ *
+ * NOTE(review): the type name comes from hdr_type_string(), which reads
+ * hdr->type as stored, while the switch below uses NTOH__u32(hdr->type);
+ * some 64-bit fields (cookies, hdr_data, get match bits) are also printed
+ * without byte-order conversion - confirm this is intended.
+ */
+static void
+kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        CERROR("P3 Header at %p of type %s\n", hdr, type_str);
+        /* each CERROR is a separate log record, so terminate every line */
+        CERROR("    From nid/pid "LPU64"/%u\n", NTOH__u64(hdr->src_nid),
+               NTOH__u32(hdr->src_pid));
+        CERROR("    To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
+               NTOH__u32(hdr->dest_pid));
+
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_PUT:
+                CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64 (hdr->msg.put.match_bits));
+                CERROR("    Length %d, offset %d, hdr data "LPX64"\n",
+                       NTOH__u32(PTL_HDR_LENGTH(hdr)),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.get.ptl_index),
+                       hdr->msg.get.return_wmd.wh_interface_cookie,
+                       hdr->msg.get.return_wmd.wh_object_cookie,
+                       hdr->msg.get.match_bits);
+                CERROR("    Length %d, src offset %d\n",
+                       NTOH__u32 (hdr->msg.get.sink_length),
+                       NTOH__u32 (hdr->msg.get.src_offset));
+                break;
+
+        case PTL_MSG_ACK:
+                CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (hdr->msg.ack.mlength));
+                break;
+
+        case PTL_MSG_REPLY:
+                CERROR("    dst md "LPX64"."LPX64", length %d\n",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (PTL_HDR_LENGTH(hdr)));
+                break;
+
+        default:
+                /* unknown type: nothing type-specific to print */
+                break;
+        }
+
+}                               /* end of kqswnal_cerror_hdr() */
+
+static int
+kqswnal_sendmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 ptl_kiov_t   *payload_kiov,
+                 size_t        payload_nob)
+{
+        kqswnal_tx_t      *ktx;
+        int                rc;
+        ptl_nid_t          gatewaynid;
+#if KQSW_CHECKSUM
+        int                i;
+        kqsw_csum_t        csum;
+        int                sumnob;
+#endif
+        
+        /* NB, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
+               " pid %u\n", payload_nob, payload_niov, nid, pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        
+        if (payload_nob > KQSW_MAXPAYLOAD) {
+                CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
+                        payload_nob, KQSW_MAXPAYLOAD);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        if (!kqswnal_ispeer (nid)) {     /* Can't send direct: find gateway? */
+                rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                if (!kqswnal_ispeer (gatewaynid)) {
+                        CERROR("Bad gateway "LPX64" for "LPX64"\n",
+                               gatewaynid, nid);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                nid = gatewaynid;
+        }
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
+                                          type == PTL_MSG_REPLY ||
+                                          in_interrupt()));
+        if (ktx == NULL) {
+                kqswnal_cerror_hdr (hdr);
+                lib_finalize (&kqswnal_lib, private, cookie);
+        }
+
+        memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
+
+#if KQSW_CHECKSUM
+        csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
+        memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
+        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+                if (payload_kiov != NULL) {
+                        ptl_kiov_t *kiov = &payload_kiov[i];
+                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
+                                           kiov->kiov_offset;
+                        
+                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
+                        sumnob -= kiov->kiov_len;
+                } else {
+                        struct iovec *iov = &payload_iov[i];
+
+                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
+                        sumnob -= iov->iov_len;
+                }
+        }
+        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+#endif
+
+        /* Set up first frag from pre-mapped buffer (it's at least the
+         * portals header) */
+        ktx->ktx_iov[0].Base = ktx->ktx_ebuffer;
+        ktx->ktx_iov[0].Len = KQSW_HDR_SIZE;
+        ktx->ktx_niov = 1;
+
+        if (payload_nob > 0) { /* got some payload (something more to do) */
+                /* make a single contiguous message? */
+                if (payload_nob <= KQSW_TX_MAXCONTIG) {
+                        /* copy payload to ktx_buffer, immediately after hdr */
+                        if (payload_kiov != NULL)
+                                lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                   payload_niov, payload_kiov, payload_nob);
+                        else
+                                lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                  payload_niov, payload_iov, payload_nob);
+                        /* first frag includes payload */
+                        ktx->ktx_iov[0].Len += payload_nob;
+                } else {
+                        if (payload_kiov != NULL)
+                                rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                                                          payload_niov, payload_kiov);
+                        else
+                                rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                                                         payload_niov, payload_iov);
+                        if (rc != 0) {
+                                kqswnal_put_idle_tx (ktx);
+                                lib_finalize (&kqswnal_lib, private, cookie);
+                                return (-1);
+                        }
+                } 
+        }
+
+        ktx->ktx_port    = (payload_nob <= KQSW_SMALLPAYLOAD) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_SENDING;   /* => lib_finalize() on completion */
+        ktx->ktx_args[0] = private;
+        ktx->ktx_args[1] = cookie;
+
+        rc = kqswnal_launch (ktx);
+        if (rc != 0) {                    /* failed? */
+                CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid);
+        return (0);
+}
+
+/* cb_send callback (see kqswnal_lib): send a message whose payload is
+ * described by virtual-address fragments.  All the work is done by
+ * kqswnal_sendmsg(), which is handed a NULL page-fragment list; its
+ * result is returned unchanged. */
+static int
+kqswnal_send (nal_cb_t     *nal,
+              void         *private,
+              lib_msg_t    *cookie,
+              ptl_hdr_t    *hdr,
+              int           type,
+              ptl_nid_t     nid,
+              ptl_pid_t     pid,
+              unsigned int  payload_niov,
+              struct iovec *payload_iov,
+              size_t        payload_nob)
+{
+        int rc;
+
+        /* vaddr payload only: no kiov */
+        rc = kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, payload_iov, NULL,
+                              payload_nob);
+        return (rc);
+}
+
+/* cb_send_pages callback (see kqswnal_lib): send a message whose payload
+ * is described by page fragments.  All the work is done by
+ * kqswnal_sendmsg(), which is handed a NULL virtual-address fragment
+ * list; its result is returned unchanged. */
+static int
+kqswnal_send_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    ptl_hdr_t    *hdr,
+                    int           type,
+                    ptl_nid_t     nid,
+                    ptl_pid_t     pid,
+                    unsigned int  payload_niov,
+                    ptl_kiov_t   *payload_kiov,
+                    size_t        payload_nob)
+{
+        int rc;
+
+        /* page payload only: no iovec */
+        rc = kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, NULL, payload_kiov,
+                              payload_nob);
+        return (rc);
+}
+
+/* When non-zero, kqswnal_fwd_packet() also copies single-fragment packets
+ * into the tx descriptor's pre-mapped buffer (provided they fit) rather
+ * than mapping them for zero-copy transmission. */
+int kqswnal_fwd_copy_contig = 0;
+
+/*
+ * Forward a packet on behalf of the router.
+ *
+ * arg: unused here.
+ * fwd: router forwarding descriptor; supplies the fragment list, the total
+ *      byte count and the gateway/target NIDs.
+ *
+ * If no tx descriptor is available the function returns without completing
+ * 'fwd' (it is rescheduled when a descriptor is freed).  Any other failure
+ * releases the descriptor and completes 'fwd' with a negative error via
+ * kpr_fwd_done().  On a successful launch the tx descriptor is left in
+ * state KTX_FORWARDING.
+ */
+void
+kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        int             rc;
+        kqswnal_tx_t   *ktx;
+        struct iovec   *iov = fwd->kprfd_iov;
+        int             niov = fwd->kprfd_niov;
+        int             nob = fwd->kprfd_nob;
+        ptl_nid_t       nid = fwd->kprfd_gateway_nid;
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        /* The router wants this NAL to forward a packet */
+        CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n",
+                fwd, nid, niov, nob);
+
+        LASSERT (niov > 0);
+        
+        /* non-blocking descriptor allocation; 'fwd' is passed along so it
+         * can be retried later */
+        ktx = kqswnal_get_idle_tx (fwd, FALSE);
+        if (ktx == NULL)        /* can't get txd right now */
+                return;         /* fwd will be scheduled when tx desc freed */
+
+        if (nid == kqswnal_lib.ni.nid)          /* gateway is me */
+                nid = fwd->kprfd_target_nid;    /* target is final dest */
+
+        if (!kqswnal_ispeer (nid)) {
+                CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
+                rc = -EHOSTUNREACH;
+                goto failed;
+        }
+
+        if (nob > KQSW_NRXMSGBYTES_LARGE) {
+                CERROR ("Can't forward [%p] to "LPX64
+                        ": size %d bigger than max packet size %ld\n",
+                        fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE);
+                rc = -EMSGSIZE;
+                goto failed;
+        }
+
+        /* Copy when the packet is fragmented (or copying is forced) and it
+         * fits in the descriptor's buffer; otherwise map the caller's
+         * fragments directly. */
+        if ((kqswnal_fwd_copy_contig || niov > 1) &&
+            nob <= KQSW_TX_BUFFER_SIZE) 
+        {
+                /* send from ktx's pre-allocated/mapped contiguous buffer? */
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */
+                ktx->ktx_iov[0].Len = nob;
+                ktx->ktx_niov = 1;
+        }
+        else
+        {
+                /* zero copy */
+                ktx->ktx_niov = 0;        /* no frags mapped yet */
+                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                if (rc != 0)
+                        goto failed;
+        }
+
+        /* nob includes the portals header here, hence the sizeof() term in
+         * the small/large service selection */
+        ktx->ktx_port    = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_FORWARDING; /* kpr_put_packet() on completion */
+        ktx->ktx_args[0] = fwd;
+
+        rc = kqswnal_launch (ktx);
+        if (rc == 0)
+                return;
+
+        /* common failure exit: every path reaching here has set rc != 0 */
+ failed:
+        LASSERT (rc != 0);
+        CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
+
+        kqswnal_put_idle_tx (ktx);
+        /* complete now (with failure) */
+        kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc);
+}
+
+/* Completion callback registered with kpr_fwd_init() by kqswnal_rx().
+ * 'arg' is the receive descriptor whose packet was handed to the router;
+ * 'error' is non-zero if forwarding failed.  A failure is logged with the
+ * source and destination NIDs taken from the packet header; in either case
+ * the receive descriptor is requeued. */
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)arg;
+        ptl_hdr_t    *hdr;
+
+        /* The router is done with this packet */
+        if (error != 0) {
+                /* header lives at the start of the first receive page */
+                hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error);
+        }
+
+        kqswnal_requeue_rx (krx);
+}
+
+/* Dispatch one received packet.
+ *
+ * Packets addressed to this node are passed to lib_parse().  Anything else
+ * is a forwarding candidate: it is dropped (and the descriptor requeued) if
+ * the destination is a direct peer, otherwise an iovec covering the
+ * received bytes is rebuilt from the receive pages and the packet is handed
+ * to the router, with kqswnal_fwd_callback() as its completion callback. */
+void
+kqswnal_rx (kqswnal_rx_t *krx)
+{
+        ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]);
+        ptl_nid_t       dest_nid = NTOH__u64 (hdr->dest_nid);
+        int             left;           /* bytes not yet covered by iov */
+        int             niov;
+
+        if (dest_nid == kqswnal_lib.ni.nid) {
+                /* Mine: krx is requeued when lib_parse() calls back
+                 * kqswnal_recv */
+                lib_parse (&kqswnal_lib, hdr, krx);
+                return;
+        }
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        if (kqswnal_ispeer (dest_nid)) {
+                /* sender should have gone direct to the peer */
+                CERROR("dropping packet from "LPX64" for "LPX64
+                       ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        /* Forwarding may destroy the iov, so build it afresh: one fragment
+         * per receive page, the last one possibly short. */
+        niov = 0;
+        left = krx->krx_nob;
+        while (left > 0) {
+                LASSERT (niov < krx->krx_npages);
+                krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]);
+                krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, left);
+
+                left -= PAGE_SIZE;
+                niov++;
+        }
+
+        kpr_fwd_init (&krx->krx_fwd, dest_nid,
+                      krx->krx_nob, niov, krx->krx_iov,
+                      kqswnal_fwd_callback, krx);
+
+        kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd);
+}
+
+/* Receive Interrupt Handler: posts to schedulers
+ *
+ * Records the receive descriptor and byte count in the kqswnal_rx_t attached
+ * to 'rxd'.  A failed or too-short receive is logged and requeued (or
+ * silently abandoned while shutting down); a good one is appended to
+ * kqn_readyrxds under kqn_sched_lock and any waiter on kqn_sched_waitq is
+ * woken, so that kqswnal_scheduler() picks it up. */
+void 
+kqswnal_rxhandler(EP_RXD *rxd)
+{
+        long          flags;
+        int           nob    = ep_rxd_len (rxd);
+        int           status = ep_rxd_status (rxd);
+        kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
+
+        CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
+               rxd, krx, nob, status);
+
+        LASSERT (krx != NULL);
+
+        /* stored before the status check: the requeue below is given only
+         * the krx */
+        krx->krx_rxd = rxd;
+        krx->krx_nob = nob;
+
+        /* must receive a whole header to be able to parse */
+        /* NOTE(review): nob is an int compared against a size_t; a negative
+         * nob would convert to a large unsigned value and pass this test --
+         * confirm ep_rxd_len() cannot return a negative length. */
+        if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t))
+        {
+                /* receives complete with failure when receiver is removed */
+                if (kqswnal_data.kqn_shuttingdown)
+                        return;
+
+                CERROR("receive status failed with status %d nob %d\n",
+                       ep_rxd_status(rxd), nob);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        atomic_inc (&kqswnal_packets_received);
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        /* queue for the scheduler thread and wake it if it is waiting */
+        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+#if KQSW_CHECKSUM
+/* Report a checksum mismatch on a received packet.
+ *
+ * krx:   receive descriptor; the portals header is read from the start of
+ *        its first page.
+ * ishdr: non-zero if the header checksum failed, zero if the payload
+ *        checksum failed (only selects the wording of the message).
+ *
+ * Logs the addressing fields common to all messages, then the
+ * type-specific fields for ACK and PUT.  Diagnostic only: nothing is
+ * returned and the packet is not otherwise touched.
+ */
+void
+kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
+{
+        ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+        /* NB the argument list previously lacked the comma after the
+         * src_nid argument, which broke the build with KQSW_CHECKSUM set */
+        CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
+                ", dpid %d, spid %d, type %d\n",
+                ishdr ? "Header" : "Payload", krx,
+                NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid),
+                NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid),
+                NTOH__u32(hdr->type));
+
+        switch (NTOH__u32 (hdr->type))
+        {
+        case PTL_MSG_ACK:
+                CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
+                       " len %u\n",
+                       NTOH__u32(hdr->msg.ack.mlength),
+                       hdr->msg.ack.dst_wmd.handle_cookie,
+                       hdr->msg.ack.dst_wmd.handle_idx,
+                       NTOH__u64(hdr->msg.ack.match_bits),
+                       NTOH__u32(hdr->msg.ack.length));
+                break;
+        case PTL_MSG_PUT:
+                CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
+                       " len %u off %u data "LPX64"\n",
+                       NTOH__u32(hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.handle_cookie,
+                       hdr->msg.put.ack_wmd.handle_idx,
+                       NTOH__u64(hdr->msg.put.match_bits),
+                       NTOH__u32(hdr->msg.put.length),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+        case PTL_MSG_GET:
+                CERROR ("GET: <>\n");
+                break;
+        case PTL_MSG_REPLY:
+                CERROR ("REPLY: <>\n");
+                break;
+        default:
+                CERROR ("TYPE?: <>\n");
+        }
+}
+#endif
+
+/*
+ * Common receive path behind kqswnal_recv() and kqswnal_recv_pages().
+ *
+ * Copies 'mlen' bytes of payload out of the receive descriptor's pages
+ * (the payload starts KQSW_HDR_SIZE bytes into the first page) into the
+ * caller's fragments, then completes the message with lib_finalize() and
+ * requeues the receive descriptor.
+ *
+ * private:   the kqswnal_rx_t being received from.
+ * niov:      number of destination fragments.
+ * iov/kiov:  destination fragments as virtual addresses or as pages; at
+ *            most one of the two may be non-NULL.
+ * mlen:      number of bytes to deliver (asserted <= rlen).
+ * rlen:      returned to the caller unchanged.
+ *
+ * With KQSW_CHECKSUM the header and payload checksums stored after the
+ * portals header are verified and mismatches reported via
+ * kqswnal_csum_error().
+ */
+static int
+kqswnal_recvmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 ptl_kiov_t   *kiov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
+        int           page;            /* index of current source page */
+        char         *page_ptr;        /* read cursor in current source page */
+        int           page_nob;        /* bytes left in current source page */
+        char         *iov_ptr;         /* write cursor in current dest frag */
+        int           iov_nob;         /* bytes left in current dest frag */
+        int           frag;            /* bytes copied this iteration */
+#if KQSW_CHECKSUM
+        kqsw_csum_t   senders_csum;
+        kqsw_csum_t   payload_csum = 0;
+        kqsw_csum_t   hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]),
+                                           sizeof(ptl_hdr_t));
+        size_t        csum_len = mlen; /* mlen is consumed by the copy loop */
+        int           csum_frags = 0;
+        int           csum_nob = 0;
+        static atomic_t csum_counter;
+        /* print the checksum summary once every 1000001 calls */
+        int           csum_verbose = (atomic_read(&csum_counter)%1000001) == 0;
+
+        atomic_inc (&csum_counter);
+
+        /* sender's header checksum immediately follows the portals header */
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                                sizeof (ptl_hdr_t), sizeof (kqsw_csum_t));
+        if (senders_csum != hdr_csum)
+                kqswnal_csum_error (krx, 1);
+#endif
+        CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
+
+        /* What was actually received must be >= payload.
+         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
+        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        LASSERT (mlen <= rlen);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+        
+        if (mlen != 0)
+        {
+                /* source cursor: first page, just past the header area */
+                page     = 0;
+                page_ptr = ((char *) page_address(krx->krx_pages[0])) +
+                        KQSW_HDR_SIZE;
+                page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
+
+                /* destination cursor: first fragment (mapped if a page) */
+                LASSERT (niov > 0);
+                if (kiov != NULL) {
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                        iov_nob = kiov->kiov_len;
+                } else {
+                        iov_ptr = iov->iov_base;
+                        iov_nob = iov->iov_len;
+                }
+
+                /* each pass copies up to the nearer of the two boundaries
+                 * (end of source page, end of destination fragment) */
+                for (;;)
+                {
+                        /* We expect the iov to exactly match mlen */
+                        LASSERT (iov_nob <= mlen);
+                        
+                        frag = MIN (page_nob, iov_nob);
+                        memcpy (iov_ptr, page_ptr, frag);
+#if KQSW_CHECKSUM
+                        payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
+                        csum_nob += frag;
+                        csum_frags++;
+#endif
+                        mlen -= frag;
+                        if (mlen == 0)
+                                break;
+
+                        /* advance the source cursor, moving to the next
+                         * page when this one is exhausted */
+                        page_nob -= frag;
+                        if (page_nob != 0)
+                                page_ptr += frag;
+                        else
+                        {
+                                page++;
+                                LASSERT (page < krx->krx_npages);
+                                page_ptr = page_address(krx->krx_pages[page]);
+                                page_nob = PAGE_SIZE;
+                        }
+
+                        /* advance the destination cursor, moving to the
+                         * next fragment (unmap old / map new for pages)
+                         * when this one is full */
+                        iov_nob -= frag;
+                        if (iov_nob != 0)
+                                iov_ptr += frag;
+                        else if (kiov != NULL) {
+                                kunmap (kiov->kiov_page);
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                                iov_nob = kiov->kiov_len;
+                        } else {
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = iov->iov_base;
+                                iov_nob = iov->iov_len;
+                        }
+                }
+
+                /* the last destination page is still mapped on loop exit */
+                if (kiov != NULL)
+                        kunmap (kiov->kiov_page);
+        }
+
+#if KQSW_CHECKSUM
+        /* sender's payload checksum follows its header checksum */
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t));
+
+        /* only comparable when the whole message was delivered
+         * (original mlen == rlen) */
+        if (csum_len != rlen)
+                CERROR("Unable to checksum data in user's buffer\n");
+        else if (senders_csum != payload_csum)
+                kqswnal_csum_error (krx, 0);
+
+        if (csum_verbose)
+                CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, "
+                       "csum_nob %d\n",
+                        hdr_csum, payload_csum, csum_frags, csum_nob);
+#endif
+        lib_finalize(nal, private, cookie);
+
+        kqswnal_requeue_rx (krx);
+
+        return (rlen);
+}
+
+/* cb_recv callback (see kqswnal_lib): deliver a received message into
+ * virtual-address fragments.  All the work is done by kqswnal_recvmsg(),
+ * which is handed a NULL page-fragment list; its result is returned
+ * unchanged. */
+static int
+kqswnal_recv(nal_cb_t     *nal,
+             void         *private,
+             lib_msg_t    *cookie,
+             unsigned int  niov,
+             struct iovec *iov,
+             size_t        mlen,
+             size_t        rlen)
+{
+        int rc;
+
+        /* vaddr destination only: no kiov */
+        rc = kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL,
+                              mlen, rlen);
+        return (rc);
+}
+
+/* cb_recv_pages callback (see kqswnal_lib): deliver a received message
+ * into page fragments.  All the work is done by kqswnal_recvmsg(), which
+ * is handed a NULL virtual-address fragment list; its result is returned
+ * unchanged. */
+static int
+kqswnal_recv_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    unsigned int  niov,
+                    ptl_kiov_t   *kiov,
+                    size_t        mlen,
+                    size_t        rlen)
+{
+        int rc;
+
+        /* page destination only: no iovec */
+        rc = kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov,
+                              mlen, rlen);
+        return (rc);
+}
+
+/* Start a kernel thread running fn(arg).
+ * Returns 0 on success, after bumping the kqn_nthreads count, or the
+ * negative value returned by kernel_thread() on failure. */
+int
+kqswnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid;
+
+        pid = kernel_thread (fn, arg, 0);
+        if (pid >= 0) {
+                /* one more thread to account for */
+                atomic_inc (&kqswnal_data.kqn_nthreads);
+                return (0);
+        }
+
+        return ((int)pid);
+}
+
+/* Counterpart of kqswnal_thread_start(): drops the kqn_nthreads count.
+ * kqswnal_scheduler() calls this just before it returns. */
+void
+kqswnal_thread_fini (void)
+{
+        atomic_dec (&kqswnal_data.kqn_nthreads);
+}
+
+/*
+ * Scheduler thread body.
+ *
+ * Until kqn_shuttingdown is set, repeatedly services three queues, taking
+ * at most one entry from each per pass:
+ *   kqn_readyrxds   - received packets, handled by kqswnal_rx()
+ *   kqn_delayedtxds - transmits to (re)launch with kqswnal_launch()
+ *   kqn_delayedfwds - forwards to retry with kqswnal_fwd_packet()
+ * kqn_sched_lock protects the queues; it is dropped around every call that
+ * processes an entry and retaken afterwards.  When a pass finds nothing the
+ * thread sleeps on kqn_sched_waitq; after KQSW_RESCHED busy passes it
+ * offers to yield the CPU.
+ *
+ * arg is unused.  Always returns 0, after calling kqswnal_thread_fini().
+ */
+int
+kqswnal_scheduler (void *arg)
+{
+        kqswnal_rx_t    *krx;
+        kqswnal_tx_t    *ktx;
+        kpr_fwd_desc_t  *fwd;
+        long             flags;
+        int              rc;
+        int              counter = 0;   /* busy passes since last yield/sleep */
+        int              did_something;
+
+        kportal_daemonize ("kqswnal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        /* invariant: kqn_sched_lock is held at the top of every iteration */
+        while (!kqswnal_data.kqn_shuttingdown)
+        {
+                did_something = FALSE;
+
+                if (!list_empty (&kqswnal_data.kqn_readyrxds))
+                {
+                        krx = list_entry(kqswnal_data.kqn_readyrxds.next,
+                                         kqswnal_rx_t, krx_list);
+                        list_del (&krx->krx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        kqswnal_rx (krx);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
+                {
+                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
+                                         kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        rc = kqswnal_launch (ktx);
+                        if (rc != 0)          /* failed: ktx_nid down? */
+                        {
+                                CERROR("Failed delayed transmit to "LPX64
+                                       ": %d\n", ktx->ktx_nid, rc);
+                                /* complete the descriptor with the error */
+                                kqswnal_tx_done (ktx, rc);
+                        }
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
+                {
+                        fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
+                        list_del (&fwd->kprfd_list);
+                        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+                        kqswnal_fwd_packet (NULL, fwd);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                    /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == KQSW_RESCHED) {
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        counter = 0;
+
+                        if (!did_something) {
+                                /* sleep until shutdown or any queue fills */
+                                rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
+                                                               kqswnal_data.kqn_shuttingdown ||
+                                                               !list_empty(&kqswnal_data.kqn_readyrxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedtxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedfwds));
+                                /* an interrupted wait is not expected here
+                                 * (kportal_blockallsigs() was called above) */
+                                LASSERT (rc == 0);
+                        } else if (current->need_resched)
+                                schedule ();
+
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        kqswnal_thread_fini ();
+        return (0);
+}
+
+/* Callback table through which the portals library drives this NAL.
+ * The send/recv entries are the wrappers around kqswnal_sendmsg() and
+ * kqswnal_recvmsg() defined in this file; the remaining callbacks are
+ * defined elsewhere.  (Initialized with the GNU "field:" designator
+ * syntax.) */
+nal_cb_t kqswnal_lib =
+{
+        nal_data:       &kqswnal_data,         /* NAL private data */
+        cb_send:        kqswnal_send,          /* vaddr-fragment send */
+        cb_send_pages:  kqswnal_send_pages,    /* page-fragment send */
+        cb_recv:        kqswnal_recv,          /* vaddr-fragment receive */
+        cb_recv_pages:  kqswnal_recv_pages,    /* page-fragment receive */
+        cb_read:        kqswnal_read,
+        cb_write:       kqswnal_write,
+        cb_malloc:      kqswnal_malloc,
+        cb_free:        kqswnal_free,
+        cb_printf:      kqswnal_printf,
+        cb_cli:         kqswnal_cli,
+        cb_sti:         kqswnal_sti,
+        cb_dist:        kqswnal_dist
+};
diff --git a/lnet/klnds/scimaclnd/Makefile.am b/lnet/klnds/scimaclnd/Makefile.am
new file mode 100644 (file)
index 0000000..6da31f0
--- /dev/null
@@ -0,0 +1,11 @@
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kscimacnal
+modulenet_DATA = kscimacnal.o
+EXTRA_PROGRAMS = kscimacnal
+
+DEFS =
+kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h
diff --git a/lnet/klnds/scimaclnd/README.scimacnal b/lnet/klnds/scimaclnd/README.scimacnal
new file mode 100644 (file)
index 0000000..d4c6a49
--- /dev/null
@@ -0,0 +1,14 @@
+
+scimacnal - A NAL for the Scali ScaMAC midlayer.
+
+The ScaMAC midlayer is a simplified API to the SCI high performance
+interconnect.
+
+In order to use this NAL you'll need to tune scimac to use larger buffers.
+See scimac.conf in this directory for an example.
+
+Overall performance and stability aren't great, but this can be attributed
+to the scimac driver, which apparently is in need of some development.
+
+TODO:
+Routing isn't yet implemented.
diff --git a/lnet/klnds/scimaclnd/scimac.conf b/lnet/klnds/scimaclnd/scimac.conf
new file mode 100644 (file)
index 0000000..bfb6d02
--- /dev/null
@@ -0,0 +1,35 @@
+#  Configuration file for the scimac driver - lustre friendly settings
+#
+
+#  The maximal number of message headers to use in the system.
+scimac_max_no_hdrs = 32
+
+#  The maximal number of eager buffers to use in the system.
+scimac_max_no_ebufs = 8
+
+#  The maximal size in bytes of each eager buffer.
+scimac_max_ebuf_size = 65536
+
+#  Enable use of a kernel thread to defer reception of packets.
+#  Default is to use a tasklet (sw interrupt).
+scimac_use_ulevel_recv = 1
+
+#  The maximal number of packets queued for transfer per path at any one time. 
+scimac_max_send_queuelen = 2000
+
+#  The packet retransmit time in milliseconds.
+#  The time that elapses between an attempt to send a packet and its retransmission.
+scimac_pkt_rexmit_time = 200
+
+#  The packet's maximal retransmit time in milliseconds.
+#  The total time during which sending a packet will be attempted before it is dropped.
+scimac_max_rexmit_time = 5000
+
+#  The lowest valid node identifier in the system.
+scimac_min_nodeid_number = 0x100
+
+#  The largest valid node identifier in the system.
+scimac_max_nodeid_number = 0xff00
+
+#  The incremental nodeid step in the system.
+scimac_nodeid_increment = 0x100
diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c
new file mode 100644 (file)
index 0000000..1066d69
--- /dev/null
@@ -0,0 +1,219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ * Based on gmnal, which is based on ksocknal and qswnal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "scimacnal.h"
+
+/* Network interface handle; filled in by PtlNIInit() at module load and
+ * exported to other modules (see EXPORT_SYMBOL at the end of this file) */
+ptl_handle_ni_t kscimacnal_ni;
+/* API-side NAL object; its methods are set in kscimacnal_initialize() */
+nal_t  kscimacnal_api;
+
+/* Module-wide state; zeroed and populated in kscimacnal_initialize() */
+kscimacnal_data_t kscimacnal_data;
+
+/* Description of this NAL for the portals router: its NAL id and the
+ * function that forwards packets on the router's behalf.
+ * NOTE(review): nothing in this file registers it, and README.scimacnal
+ * lists routing as not yet implemented. */
+kpr_nal_interface_t kscimacnal_router_interface = {
+        kprni_nalid:    SCIMACNAL,
+        kprni_arg:      NULL,
+        kprni_fwd:      kscimacnal_fwd_packet,
+};
+
+
+/* nal_t 'forward' method: pass API request 'id' with its argument and
+ * return blocks on to the library side via lib_dispatch().  args_len and
+ * ret_len are not used.  Always returns PTL_OK. */
+static int kscimacnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* there is exactly one instance of each object */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        /* nal needs ksci */
+        lib_dispatch(nal_cb, ksci, id, args, ret);
+
+        return PTL_OK;
+}
+
+
+/* nal_t 'lock' method: delegates to the library callback table's cb_cli,
+ * passing 'flags' through. */
+static void kscimacnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* there is exactly one instance of each object */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+
+/* nal_t 'unlock' method: delegates to the library callback table's cb_sti,
+ * passing 'flags' through. */
+static void kscimacnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* there is exactly one instance of each object */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+
+/* nal_t 'shutdown' method: nothing to tear down here; only checks that it
+ * was called on the one scimac NAL and reports success.  'ni' is unused. */
+static int kscimacnal_shutdown(nal_t *nal, int ni)
+{
+        int rc = 0;
+
+        LASSERT (nal == &kscimacnal_api);
+
+        return rc;
+}
+
+
+/* nal_t 'yield' method: give up the CPU, but only if the kernel has
+ * flagged the current task as needing a reschedule. */
+static void kscimacnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        if (!current->need_resched)
+                return;
+
+        schedule();
+}
+
+
+/* NAL start-up function handed to PtlNIInit(): initializes the portals
+ * library for this node's NID with the requested portal-table and
+ * access-control sizes, and returns the API-side NAL object.
+ * 'interface' and 'requested_pid' are not used. */
+static nal_t *kscimacnal_init(int interface, ptl_pt_index_t  ptl_size,
+                ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        /* FIXME: Need ScaMac function to get #nodes */
+        int       nnids = 512;
+        ptl_nid_t nid   = kscimacnal_data.ksci_nid;
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", nid, nnids);
+
+        lib_init(&kscimacnal_lib, nid, 0, nnids, ptl_size, ac_size);
+
+        return &kscimacnal_api;
+}
+
+
+/* Module exit point, called by the kernel at module unload time.
+ * Flags the NAL as shutting down, withdraws the exported NI symbol, shuts
+ * down the network interface and the portals library, and finally releases
+ * the ScaMAC handle obtained at load time. */
+static void __exit 
+kscimacnal_finalize(void)
+{
+        /* FIXME: How should the shutdown procedure really look? */
+        kscimacnal_data.ksci_shuttingdown = 1;
+
+        PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni);
+
+        /* portals first ... */
+        PtlNIFini(kscimacnal_ni);
+        lib_fini(&kscimacnal_lib);
+
+        /* ... then the transport underneath it */
+        mac_finish(kscimacnal_data.ksci_machandle);
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+}
+
+
+/* Called by kernel at module insertion time
+ *
+ * Wires up the API-side nal_t, resets the module state, opens the ScaMAC
+ * adapter (registering kscimacnal_rx as the receive callback), checks that
+ * the ScaMAC MTU is large enough, derives this node's NID from the
+ * adapter's physical address and brings up the portals network interface.
+ *
+ * Returns 0 on success; -1 if mac_init(), the MTU check or
+ * mac_get_physaddr() fails; -ENOMEM if PtlNIInit() fails.  Every failure
+ * after mac_init() releases the ScaMAC handle again.
+ */
+static int __init
+kscimacnal_initialize(void)
+{
+        int rc;
+        unsigned long     nid=0;
+        mac_handle_t    *machandle = NULL;
+
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kscimacnal_api.forward = kscimacnal_forward;
+        kscimacnal_api.shutdown = kscimacnal_shutdown;
+        kscimacnal_api.yield = kscimacnal_yield;
+        kscimacnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kscimacnal_api.lock= kscimacnal_lock;
+        kscimacnal_api.unlock= kscimacnal_unlock;
+        kscimacnal_api.nal_data = &kscimacnal_data;
+
+        kscimacnal_lib.nal_data = &kscimacnal_data;
+
+        /* clears ksci_init and ksci_shuttingdown along with everything else */
+        memset(&kscimacnal_data, 0, sizeof(kscimacnal_data));
+
+        kscimacnal_data.ksci_cb = &kscimacnal_lib;
+
+        /* We're not using this, but cli/sti callbacks does... ??? */
+        spin_lock_init(&kscimacnal_data.ksci_dispatch_lock);
+
+        /* FIXME: We only support one adapter for now */
+        machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx,
+                        &kscimacnal_data);
+
+        if(!machandle) {
+                CERROR("mac_init() failed\n");
+                return -1;
+        }
+
+        kscimacnal_data.ksci_machandle = machandle;
+
+        /* Make sure the scimac MTU is tuned */
+        if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) {
+                CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n",
+                                mac_get_mtusize(machandle), SCIMACNAL_MTU);
+                CERROR("Consult README.scimacnal for more information\n");
+                mac_finish(machandle);
+                return -1;
+        }
+
+        /* Get the node ID */
+        /* mac_get_physaddrlen() is a function instead of define, sigh */
+        LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid));
+        if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) {
+                CERROR("mac_get_physaddr() failed\n");
+                mac_finish(machandle);
+                return -1;
+        }
+        /* NOTE(review): the address is written into an unsigned long and
+         * then converted with ntohl(), which yields a 32-bit value; where
+         * unsigned long is wider than the physical address, the bytes that
+         * get filled in depend on endianness -- confirm on such targets. */
+        nid = ntohl(nid);
+        kscimacnal_data.ksci_nid = nid;
+
+
+        /* Initialize Network Interface */
+        /* FIXME: What do the magic numbers mean? Documentation anyone? */
+        /* (kscimacnal_init receives 32 as ptl_size and 4 as ac_size
+         * and forwards them to lib_init()) */
+        rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                mac_finish(machandle);
+                return (-ENOMEM);
+        }
+
+        PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
+
+        /* We're done now, it's OK for the RX callback to do stuff */
+        kscimacnal_data.ksci_init = 1;
+
+        return 0;
+}
+
+
+MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_LICENSE("GPL");
+
+module_init (kscimacnal_initialize);
+module_exit (kscimacnal_finalize);
+
+EXPORT_SYMBOL(kscimacnal_ni);
diff --git a/lnet/klnds/scimaclnd/scimacnal.h b/lnet/klnds/scimaclnd/scimacnal.h
new file mode 100644 (file)
index 0000000..1ff180e
--- /dev/null
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+ */
+
+
+#ifndef _SCIMACNAL_H
+#define _SCIMACNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <asm/page.h>            /* For PAGE_SIZE */
+
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <scamac.h>
+
+#ifndef MAC_SAPID_LUSTRE
+#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
+#endif /* MAC_SAPID_LUSTRE */
+
+#define SCIMACNAL_MTU 65536
+/* FIXME: What is really the MTU of lustre? */
+#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#endif
+
+typedef struct {                        /* receive context, passed to lib_parse() as 'private' */
+        mac_handle_t    *handle;        /* handle argument of kscimacnal_rx() */
+        mac_mblk_t      *msg;           /* received message; kscimacnal_recv() reads the payload from it */
+        mac_msg_type_t   type;          /* type argument of kscimacnal_rx() */
+        void            *userdata;      /* userdata argument of kscimacnal_rx() (used there as kscimacnal_data_t *) */
+}  kscimacnal_rx_t;
+
+
+typedef struct {                        /* per-send state; freed by kscimacnal_txrelease() */
+        nal_cb_t        *ktx_nal;       /* NAL to pass to lib_finalize() */
+        void            *ktx_private;   /* 'private' argument saved from kscimacnal_send() */
+        lib_msg_t       *ktx_cookie;    /* lib message finalized when the transmit is released */
+        ptl_hdr_t       ktx_hdr;        /* private copy of the header (the caller's copy lives on its stack) */
+}  kscimacnal_tx_t;
+
+
+typedef struct {                                /* NAL-wide state; the instance is kscimacnal_data */
+        char              ksci_init;            /* set to 1 at the end of initialisation; RX ignores messages until then */
+        char              ksci_shuttingdown;    /* non-zero makes the RX callback drop incoming messages */
+        ptl_nid_t         ksci_nid;             /* local node ID, taken from the ScaMAC physical address */
+        nal_cb_t         *ksci_cb;              /* NOTE(review): presumably points at kscimacnal_lib - not set in the code visible here */
+        spinlock_t        ksci_dispatch_lock;   /* taken/released by the cb_cli/cb_sti callbacks */
+        mac_handle_t     *ksci_machandle;       /* ScaMAC handle used for mac_send() and MTU queries */
+}  kscimacnal_data_t;
+
+extern kscimacnal_data_t   kscimacnal_data;     /* NAL-wide state */
+extern nal_t            kscimacnal_api;
+extern nal_cb_t         kscimacnal_lib;         /* callback table defined in scimacnal_cb.c */
+
+void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);    /* stub: only logs that forwarding is not implemented */
+void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata);    /* ScaMAC receive callback */
+
+
+#endif  /* _SCIMACNAL_H */
diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c
new file mode 100644 (file)
index 0000000..7e4a2e8
--- /dev/null
@@ -0,0 +1,468 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "scimacnal.h"
+
+/* cb_read callback: copy 'len' bytes from src_addr to dst_addr.
+ * The copy is a plain memcpy(); 'private' is not used.  Always returns 0. */
+static int
+kscimacnal_read (nal_cb_t *nal, void *private,
+                 void *dst_addr, user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+
+/* cb_write callback: copy 'len' bytes from src_addr to dst_addr.
+ * The copy is a plain memcpy(); 'private' is not used.  Always returns 0. */
+static int
+kscimacnal_write(nal_cb_t *nal, void *private,
+                 user_ptr dst_addr, void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+
+/* cb_malloc callback: allocate 'len' bytes with PORTAL_ALLOC and return the
+ * resulting pointer as-is (callers must check it before use). */
+static void *
+kscimacnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *mem;
+
+        PORTAL_ALLOC(mem, len);
+
+        return mem;
+}
+
+
+/* cb_free callback: release a buffer of 'len' bytes via PORTAL_FREE. */
+static void
+kscimacnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+
+/* cb_printf callback: format the message into a 256-byte buffer (longer
+ * output is truncated by vsnprintf) and printk it prefixed with the current
+ * CPU id.  Nothing is printed unless D_NET is set in portal_debug. */
+static void
+kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        /* network debugging switched off: stay silent */
+        if (!(portal_debug & D_NET))
+                return;
+
+        va_start(args, fmt);
+        vsnprintf(text, sizeof(text), fmt, args);
+        va_end(args);
+
+        printk("CPUId: %d %s",smp_processor_id(), text);
+}
+
+
+/* cb_cli callback: take the NAL dispatch lock with spin_lock_irqsave();
+ * the saved interrupt state is stored in *flags for kscimacnal_sti(). */
+static void
+kscimacnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+
+        spin_lock_irqsave(&ksci->ksci_dispatch_lock, *flags);
+}
+
+
+/* cb_sti callback: release the NAL dispatch lock and restore the interrupt
+ * state that kscimacnal_cli() saved in *flags. */
+static void
+kscimacnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+
+        spin_unlock_irqrestore(&ksci->ksci_dispatch_lock, *flags);
+}
+
+
+/* cb_dist callback: report the "distance" to 'nid' in *dist.
+ * The local node is at distance 0, every other node at distance 1.
+ * Always returns 0. */
+static int
+kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* FIXME: Network distance has a meaning, but is there no easy
+         * way to figure it out (depends on routing) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return 0;
+}
+
+
+static
+char * get_mac_error(mac_status_t status) 
+{
+        switch(status) {
+                case MAC_MSG_STAT_OK:
+                        return "MAC_MSG_STAT_OK";
+                case MAC_MSG_STAT_FREED:
+                        return "MAC_MSG_STAT_FREED";
+                case MAC_MSG_STAT_ABORTED:
+                        return "MAC_MSG_STAT_ABORTED";
+                case MAC_MSG_STAT_TIMEDOUT:
+                        return "MAC_MSG_STAT_TIMEDOUT";
+                case MAC_MSG_STAT_NODEUNREACH:
+                        return "MAC_MSG_STAT_NODEUNREACH";
+                case MAC_MSG_STAT_NETDOWN:
+                        return "MAC_MSG_STAT_NETDOWN";
+                case MAC_MSG_STAT_RESET:
+                        return "MAC_MSG_STAT_RESET";
+                case MAC_MSG_STAT_INITFAILED:
+                        return "MAC_MSG_STAT_INITFAILED";
+                case MAC_MSG_STAT_SYNCFAILED:
+                        return "MAC_MSG_STAT_SYNCFAILED";
+                case MAC_MSG_STAT_BADPROTO:
+                        return "MAC_MSG_STAT_BADPROTO";
+                case MAC_MSG_STAT_NOBUFSPACE:
+                        return "MAC_MSG_STAT_NOBUFSPACE";
+                case MAC_MSG_STAT_CONGESTION:
+                        return "MAC_MSG_STAT_CONGESTION";
+                case MAC_MSG_STAT_OTHER:
+                        return "MAC_MSG_STAT_OTHER";
+                default:
+                        return "Unknown error";
+        }
+}
+
+
+/* FIXME add routing code here ? */
+
+/* Called by ScaMac when transmission is complete (ie. message is released).
+ *
+ * 'context' is the kscimacnal_tx_t that kscimacnal_send() registered on the
+ * header mblk.  The lib message is finalized and the tx descriptor freed
+ * here, whatever the status. */
+static void 
+kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
+{
+        kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
+
+        LASSERT (ktx != NULL);
+
+        /* lib_finalize() takes no completion status, so a failed
+         * transmission can only be logged */
+        if (status != MAC_MSG_STAT_OK) {
+                CERROR("%s (%d):\n", get_mac_error(status), status);
+        }
+
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+}
+
+
+/* Called by portals when it wants to send a message.
+ * Since ScaMAC has its own TX thread we don't bother setting up our own.
+ *
+ * The header is copied into a freshly allocated kscimacnal_tx_t and becomes
+ * the first mblk of the message; every payload iovec is linked behind it
+ * without copying.  kscimacnal_txrelease() is registered on the header mblk
+ * and finalizes 'cookie' when ScaMAC releases the message.
+ *
+ * Returns 0 once the message has been handed to mac_send(), -EINVAL if
+ * header plus payload exceed the ScaMAC MTU, -ENOMEM on allocation failure,
+ * or the non-zero return code of mac_send(). */
+static int 
+kscimacnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type, 
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           unsigned int     payload_niov,
+           struct iovec    *payload_iov,
+           size_t           payload_len)
+{
+        kscimacnal_tx_t    *ktx=NULL;
+        kscimacnal_data_t  *ksci = nal->nal_data;
+        int              rc=0;
+        int              buf_len = sizeof(ptl_hdr_t) + payload_len;
+        mac_mblk_t      *msg=NULL, *lastblk, *newblk;
+        unsigned long   physaddr;
+        
+
+        /* payload_len is a size_t: print it as a long, like
+         * kscimacnal_read()/kscimacnal_write() do */
+        CDEBUG(D_NET, "sending %ld bytes from %p to nid 0x%Lx niov: %d\n",
+               (long)payload_len, payload_iov, nid, payload_niov);
+
+        LASSERT(ksci != NULL);
+
+        LASSERT(hdr != NULL);
+
+        /* Do real check if we can send this */
+        if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
+                CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
+                                mac_get_mtusize(ksci->ksci_machandle));
+                return -EINVAL;
+        }
+
+
+        /* save transaction info for later finalize and cleanup */
+        PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
+        if (!ktx) {
+                return -ENOMEM;
+        }
+
+        /* Fill in the descriptor before it is registered as the release
+         * callback context, so the callback can never see it half set up */
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+
+        /* *SIGH* hdr is a stack variable in the calling function, so we
+         * need to copy it to a buffer. Zerocopy magic (or is it just
+         * deferred memcpy?) is annoying sometimes.  */
+        memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t));
+
+        /* First, put the header in the main message mblk */
+        msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t),
+                        kscimacnal_txrelease, ktx);
+        if (!msg) {
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return -ENOMEM;
+        }
+        mac_put_mblk(msg, sizeof(ptl_hdr_t));
+        lastblk=msg;
+
+        /* Allocate additional mblks for each iov as needed.
+         * Essentially lib_copy_iov2buf with a twist or two */
+        while (payload_len > 0)
+        {
+                ptl_size_t nob;
+
+                LASSERT (payload_niov > 0);
+
+                nob = MIN (payload_iov->iov_len, payload_len);
+
+                /* We don't need a callback on the additional mblks, since
+                 * all release callbacks seems to be called when the entire
+                 * message has been sent */
+                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
+                if(!newblk) {
+                        /* NOTE(review): if mac_free_msg() invokes the release
+                         * callback registered on the header mblk, ktx is
+                         * already finalized and freed there and the
+                         * PORTAL_FREE below is a double free - confirm
+                         * against the ScaMAC API */
+                        mac_free_msg(msg);
+                        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                        return -ENOMEM;
+                }
+                mac_put_mblk(newblk, nob);
+                mac_link_mblk(lastblk, newblk);
+                lastblk=newblk;
+
+                payload_len -= nob;
+                payload_niov--;
+                payload_iov++;
+        }
+
+        CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid);
+
+        /* the nid is the ScaMAC physical address in host byte order */
+        physaddr = htonl(nid);
+
+        if((rc=mac_send(ksci->ksci_machandle, msg,
+                                        (mac_physaddr_t *) &physaddr))) {
+                CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
+                /* NOTE(review): same possible double free of ktx as in the
+                 * allocation-failure path above */
+                mac_free_msg(msg);
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return rc;
+        }
+
+        return 0;
+}
+
+
+/* Forwarding entry point: not supported by this NAL, so the request is
+ * only reported as an error. */
+void
+kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Process a received portals packet */
+/* Called by the ScaMac RX thread when a packet is received */
+void
+kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
+                void *userdata)
+{
+        ptl_hdr_t       *hdr = NULL;
+        kscimacnal_rx_t     krx; 
+        mac_size_t       size;
+        kscimacnal_data_t  *ksci = userdata;
+
+        LASSERT(ksci != NULL);
+
+        if ( !ksci->ksci_init || ksci->ksci_shuttingdown || 
+                    type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) {
+                /* We're not interested in messages not for us, ignore */
+                mac_free_msg(msg);
+                return;
+        }
+
+        size = mac_msg_size(msg);
+
+        CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", 
+                        msg, type, size, mac_msg_mblks(msg));
+
+        if( size < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (ksci->ksci_shuttingdown)
+                        return;
+                CERROR("kscimacnal: did not receive complete portal header,"
+                                "size= %ld\n", size);
+                /* Free the message before exiting */
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* Provide everything we know */
+        krx.handle = handle;
+        krx.msg = msg;
+        krx.type = type;
+        krx.userdata = userdata;
+
+        /* mac_msg_next returns the next mblk with unread data */
+        hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) );
+
+        if(!hdr) {
+                CERROR("kscimacnal: no data block in message %p\n", msg);
+                mac_free_msg(msg);
+                return;
+        }
+
+        if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc and calls our callback */
+                lib_parse(&kscimacnal_lib, hdr, &krx);
+                PROF_FINISH(lib_parse);
+#if 0 /* FIXME: Is it possible to detect this? */
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx:"
+                                "target is a  peer\n",
+                                hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+#endif /* if 0 FIXME */
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n",
+                                kscimacnal_lib.ni.nid, hdr->dest_nid);
+        }
+
+        mac_free_msg(msg);
+
+        CDEBUG(D_NET, "msg %p: Done\n", msg);
+}
+
+
+/* Called by portals to process a received packet.
+ *
+ * 'private' is the kscimacnal_rx_t set up by kscimacnal_rx().  Up to 'mlen'
+ * payload bytes are copied from the unread mblks of the message into the
+ * 'niov' buffers described by 'iov'; 'rlen' is the payload length the
+ * sender claims to have sent.  The lib message 'cookie' is finalized before
+ * returning.  Returns rlen. */
+static int kscimacnal_recv(nal_cb_t     *nal, 
+                      void         *private, 
+                      lib_msg_t    *cookie, 
+                      unsigned int  niov, 
+                      struct iovec *iov, 
+                      size_t        mlen, 
+                      size_t        rlen)
+{
+        kscimacnal_rx_t    *krx = private;
+        mac_mblk_t      *mblk;
+        void            *src;
+        mac_size_t       pkt_len;
+        ptl_size_t       iovused=0;
+        size_t           remaining = mlen;      /* bytes still to deliver */
+
+        LASSERT (krx != NULL);
+        LASSERT (krx->msg != NULL);
+
+        /* size_t values are printed as longs */
+        CDEBUG(D_NET,"msg %p: mlen=%ld, rlen=%ld, niov=%d\n",
+                        krx->msg, (long)mlen, (long)rlen, niov);
+
+        /* What was actually received must be >= what sender claims to have
+         * sent.  This is an LASSERT, since lib-move doesn't check cb return
+         * code yet. Also, rlen seems to be negative when mlen==0 so don't
+         * assert on that.
+         */
+        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
+        LASSERT (mlen==0 || mlen <= rlen);
+
+        PROF_START(memcpy);
+
+        /* mac_msg_next returns next mblk with unread data (ie. can
+         * be same mblk).  Stop as soon as mlen bytes have been delivered:
+         * the message may carry more payload (rlen) than the iovecs were
+         * sized for. */
+        while (remaining != 0 && (mblk = mac_msg_next(krx->msg))) {
+                pkt_len = mac_mblk_len(mblk);
+                src = mac_get_mblk(mblk, pkt_len); /* Next unread block */
+
+                CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld  src: %p\n",
+                                krx->msg, mblk, pkt_len, src);
+
+                LASSERT(src != NULL);
+
+                /* Essentially lib_copy_buf2iov but with continuation support,
+                 * we "gracefully" thrash the argument vars ;) */
+                while (pkt_len > 0 && remaining > 0) {
+                        ptl_size_t nob;
+
+                        LASSERT (niov > 0);
+
+                        LASSERT(iovused < iov->iov_len);
+
+                        /* limited by room left in this iov, data left in
+                         * this mblk and bytes still wanted */
+                        nob = MIN (iov->iov_len-iovused, pkt_len);
+                        nob = MIN (nob, remaining);
+                        CDEBUG(D_NET, "iovbase: %p iovlen: %ld src: %p  nob: %ld "
+                                        "iovused: %ld\n",
+                                        iov->iov_base, (long)iov->iov_len,
+                                        src, (long)nob, (long)iovused);
+
+                        memcpy (iov->iov_base+iovused, src, nob);
+                        pkt_len -= nob;
+                        remaining -= nob;
+                        src += nob;
+
+                        if(nob+iovused < iov->iov_len) {
+                                /* We didn't use all of the iov */
+                                iovused+=nob;
+                        }
+                        else {
+                                niov--;
+                                iov++;
+                                iovused=0;
+                        }
+                }
+        }
+        PROF_FINISH(memcpy);
+
+        CDEBUG(D_NET, "Calling lib_finalize.\n");
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        CDEBUG(D_NET, "Done.\n");
+
+        return rlen;
+}
+
+
+nal_cb_t kscimacnal_lib = {             /* callback table for this NAL; also passed to lib_parse() on receive */
+        nal_data:       &kscimacnal_data,               /* NAL private data */
+        cb_send:         kscimacnal_send,
+        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_recv:         kscimacnal_recv,
+        cb_recv_pages:   NULL,
+        cb_read:         kscimacnal_read,       /* plain memcpy */
+        cb_write:        kscimacnal_write,      /* plain memcpy */
+        cb_malloc:       kscimacnal_malloc,
+        cb_free:         kscimacnal_free,
+        cb_printf:       kscimacnal_printf,     /* prints only when D_NET debugging is on */
+        cb_cli:          kscimacnal_cli,        /* takes ksci_dispatch_lock */
+        cb_sti:          kscimacnal_sti,        /* releases ksci_dispatch_lock */
+        cb_dist:         kscimacnal_dist
+};
diff --git a/lnet/klnds/socklnd/Makefile.am b/lnet/klnds/socklnd/Makefile.am
new file mode 100644 (file)
index 0000000..437d7fc
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ksocknal
+modulenet_DATA = ksocknal.o
+EXTRA_PROGRAMS = ksocknal
+
+DEFS =
+ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h
diff --git a/lnet/klnds/socklnd/Makefile.mk b/lnet/klnds/socklnd/Makefile.mk
new file mode 100644 (file)
index 0000000..46edf01
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Kernelenv
+
+obj-y += ksocknal.o
+ksocknal-objs    := socknal.o socknal_cb.o
+
diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c
new file mode 100644 (file)
index 0000000..d15d8c8
--- /dev/null
@@ -0,0 +1,863 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+ptl_handle_ni_t         ksocknal_ni;
+static nal_t            ksocknal_api;           /* returned to the caller by ksocknal_init() */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ksock_nal_data_t ksocknal_data;                 /* NAL-wide state; non-static on 2.5+ kernels */
+#else
+static ksock_nal_data_t ksocknal_data;          /* NAL-wide state */
+#endif
+
+kpr_nal_interface_t ksocknal_router_interface = {
+        kprni_nalid:      SOCKNAL,
+        kprni_arg:        &ksocknal_data,
+        kprni_fwd:        ksocknal_fwd_packet,
+};
+
+
+/* API forward hook: hand request 'id' with its argument and result blocks
+ * to lib_dispatch().  args_len and ret_len are not used.  Always returns
+ * PTL_OK. */
+int
+ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        /* the NAL data doubles as the 'private' argument: ksocknal_send
+         * needs it */
+        lib_dispatch(data->ksnd_nal_cb, data, id, args, ret);
+
+        return PTL_OK;
+}
+
+/* API shutdown hook: close every connection and return the result of
+ * ksocknal_close_sock().  'nal' and 'ni' are not used. */
+int
+ksocknal_api_shutdown(nal_t *nal, int ni)
+{
+        int rc;
+
+        CDEBUG (D_NET, "closing all connections\n");
+
+        /* a zero nid selects all sockets */
+        rc = ksocknal_close_sock(0);
+
+        return rc;
+}
+
+/* API yield hook: give up the CPU via our_cond_resched(). */
+void
+ksocknal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+}
+
+/* API lock hook: delegate to the cb_cli callback of the NAL's library
+ * callback table, passing 'flags' through. */
+void
+ksocknal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *cb = data->ksnd_nal_cb;
+
+        cb->cb_cli(cb, flags);
+}
+
+/* API unlock hook: delegate to the cb_sti callback of the NAL's library
+ * callback table, passing 'flags' through. */
+void
+ksocknal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *cb = data->ksnd_nal_cb;
+
+        cb->cb_sti(cb, flags);
+}
+
+/* NAL init entry point: initialise the portals library for this NAL with
+ * the currently configured nid and the requested table sizes, then return
+ * the API structure.  'interface' and 'requested_pid' are not used. */
+nal_t *
+ksocknal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        ptl_nid_t mynid = ksocknal_data.ksnd_mynid;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", mynid);
+
+        lib_init(&ksocknal_lib, mynid, 0, 10, ptl_size, ac_size);
+
+        return (&ksocknal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)  /* pre-2.5: the socket is a member of the inode's union */
+#endif
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)       /* map an inode to its struct socket */
+{
+        return SOCKET_I(inode);
+}
+
+/* Record 'nid' as this node's ID, both in the NAL data and in the library
+ * NI of ksocknal_lib.  Always returns 0. */
+int
+ksocknal_set_mynid(ptl_nid_t nid)
+{
+        /* FIXME: we have to do this because we call lib_init() at module
+         * insertion time, which is before we have 'mynid' available.  lib_init
+         * sets the NAL's nid, which it uses to tell other nodes where packets
+         * are coming from.  This is not a very graceful solution to this
+         * problem. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ksocknal_lib.ni.nid);
+
+        ksocknal_lib.ni.nid = nid;
+        ksocknal_data.ksnd_mynid = nid;
+
+        return (0);
+}
+
+/* Bind interrupt 'irq' to CPU number 'cpu' by running a shell command that
+ * writes the CPU's bitmask to /proc/irq/<irq>/smp_affinity.
+ * Compiled to a no-op unless CONFIG_SMP and CPU_AFFINITY are set. */
+void
+ksocknal_bind_irq (unsigned int irq, int cpu)
+{
+#if (defined(CONFIG_SMP) && CPU_AFFINITY)
+        char  cmdline[64];
+        char *argv[] = {"/bin/sh",
+                        "-c",
+                        cmdline,
+                        NULL};
+        char *envp[] = {"HOME=/",
+                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                        NULL};
+
+        /* smp_affinity is parsed as a hexadecimal bitmask, so the mask must
+         * be written in hex: in decimal, CPU 4 (mask 16) would be read as
+         * 0x16 and select the wrong CPUs */
+        snprintf (cmdline, sizeof (cmdline),
+                  "echo %x > /proc/irq/%u/smp_affinity", 1U << cpu, irq);
+
+        printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+                irq, cpu, cmdline);
+
+        /* FIXME: Find a better method of setting IRQ affinity...
+         */
+
+        call_usermodehelper (argv[0], argv, envp);
+#endif
+}
+
+int
+ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        ksock_sched_t     *sched = NULL;
+        unsigned int       irq = 0;
+        struct net_device *dev = NULL;
+        int                ret;
+        int                idx;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        memset (conn, 0, sizeof (conn));        /* zero for consistency */
+
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_saved_data_ready = sock->sk->data_ready;
+        conn->ksnc_saved_write_space = sock->sk->write_space;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ksocknal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+#warning check it is OK to derefence sk->dst_cache->dev like this...
+        lock_sock (conn->ksnc_sock->sk);
+
+        if (conn->ksnc_sock->sk->dst_cache != NULL) {
+                dev = conn->ksnc_sock->sk->dst_cache->dev;
+                if (dev != NULL) {
+                        irq = dev->irq;
+                        if (irq >= NR_IRQS) {
+                                CERROR ("Unexpected IRQ %x\n", irq);
+                                irq = 0;
+                        }
+                }
+        }
+
+        release_sock (conn->ksnc_sock->sk);
+
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (irq == 0 ||
+            ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) {
+                /* This is a software NIC, or we haven't associated it with
+                 * a CPU yet */
+
+                /* Choose the CPU with the fewest connections */
+                sched = ksocknal_data.ksnd_schedulers;
+                for (idx = 1; idx < SOCKNAL_N_SCHED; idx++)
+                        if (sched->kss_nconns >
+                            ksocknal_data.ksnd_schedulers[idx].kss_nconns)
+                                sched = &ksocknal_data.ksnd_schedulers[idx];
+
+                if (irq != 0) {                 /* Hardware NIC */
+                        /* Remember which scheduler we chose */
+                        idx = sched - ksocknal_data.ksnd_schedulers;
+
+                        LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK);
+
+                        if (bind_irq)       /* remember if we will bind below */
+                                idx |= SOCKNAL_IRQ_BOUND;
+
+                        ksocknal_data.ksnd_irq_info[irq] = idx;
+                }
+        } else { 
+                /* This is a hardware NIC, associated with a CPU */
+                idx = ksocknal_data.ksnd_irq_info[irq];
+
+                /* Don't bind again if we've bound already */
+                if ((idx & SOCKNAL_IRQ_BOUND) != 0)
+                        bind_irq = 0;
+                
+                sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK];
+        }
+
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
+        list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist);
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (bind_irq &&                         /* irq binding required */
+            irq != 0)                           /* hardware NIC */
+                ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers);
+
+        /* NOW it's safe to get called back when socket is ready... */
+        sock->sk->user_data = conn;
+        sock->sk->data_ready = ksocknal_data_ready;
+        sock->sk->write_space = ksocknal_write_space;
+
+        /* ...which I call right now to get things going */
+        ksocknal_data_ready (sock->sk, 0);
+        ksocknal_write_space (sock->sk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/* Close the connection to peer 'nid'.
+ * Passing in a zero nid will close all connections.
+ *
+ * The selected connections are moved off the socket list under the list
+ * lock; each then gets its original socket callbacks restored, its
+ * user_data cleared, its scheduler's connection count dropped and the
+ * socket list's reference released.
+ *
+ * Returns 0 on success, or -ENOENT if a non-zero nid has no connection. */
+int
+ksocknal_close_sock(ptl_nid_t nid)
+{
+        /* the irqsave/irqrestore lock macros need an unsigned long */
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ksocknal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ksocknal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+
+                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid) {
+                        list_del (&conn->ksnc_list);
+                        list_add (&conn->ksnc_list, &death_row);
+                        break;
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid && list_empty (&death_row))
+                return (-ENOENT);
+
+        while (!list_empty (&death_row)) {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+
+                /* NB I _have_ to restore the callback, rather than storing
+                 * a noop, since the socket could survive past this module
+                 * being unloaded!! */
+                conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready;
+                conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space;
+
+                /* OK; no more callbacks, but they could be in progress now,
+                 * so wait for them to complete... */
+                write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+                /* ...however if I get the lock before a callback gets it,
+                 * this will make them noop
+                 */
+                conn->ksnc_sock->sk->user_data = NULL;
+
+                /* And drop the scheduler's connection count while I've got
+                 * the exclusive lock */
+                conn->ksnc_scheduler->kss_nconns--;
+
+                write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock,
+                                        flags);
+
+                ksocknal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        }
+
+        return (0);
+}
+
+/* Return the TCP option block of socket 'sk'; where it lives depends on
+ * the kernel version. */
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        /* pre-2.5: embedded in the sock's protocol-private union */
+        return &(sk->tp_pinfo.af_tcp);
+#else
+        /* 2.5+: the sock is the leading part of a struct tcp_sock */
+        struct tcp_sock *s = (struct tcp_sock *)sk;
+
+        return &s->tcp;
+#endif
+}
+
+/* "Push" a connection: force the socket's nonagle flag on, issue a
+ * TCP_NODELAY setsockopt, then restore the previous nonagle value.
+ * NOTE(review): presumably this flushes data TCP is holding back - confirm
+ * against the kernel's TCP_NODELAY handling. */
+void
+ksocknal_push_conn (ksock_conn_t *conn)
+{
+        struct sock    *sk = conn->ksnc_sock->sk;
+        struct tcp_opt *tp = sock2tcp_opt(sk);
+        mm_segment_t    saved_fs;
+        int             saved_nonagle;
+        int             enable = 1;
+        int             rc;
+
+        /* remember the current setting and switch nonagle on */
+        lock_sock (sk);
+        saved_nonagle = tp->nonagle;
+        tp->nonagle = 1;
+        release_sock (sk);
+
+        /* the option value lives in kernel memory, so switch the address
+         * limit to KERNEL_DS around the call */
+        saved_fs = get_fs ();
+        set_fs (KERNEL_DS);
+
+        rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                   (char *)&enable, sizeof (enable));
+        LASSERT (rc == 0);
+
+        set_fs (saved_fs);
+
+        /* put the caller's setting back */
+        lock_sock (sk);
+        tp->nonagle = saved_nonagle;
+        release_sock (sk);
+}
+
+/* Push the connection to 'nid', or every connection if 'nid' is zero.
+ * Returns 0, or -ENOENT when a specific 'nid' has no connection. */
+int
+ksocknal_push_sock (ptl_nid_t nid)
+{
+        struct list_head  *pos;
+        ksock_conn_t      *conn;
+        int                target = 0;
+        int                seen;
+
+        if (nid != 0) {
+                /* single peer: ksocknal_get_conn() hands back a ref */
+                conn = ksocknal_get_conn (nid);
+                if (conn == NULL)
+                        return (-ENOENT);
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+                return (0);
+        }
+
+        /* All peers.  The list lock is not held while pushing, so entries
+         * may vanish between passes; every pass therefore rescans from the
+         * list head and picks the target'th entry, stopping once the list
+         * is shorter than that. */
+        for (;;) {
+                conn = NULL;
+                seen = 0;
+
+                read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+                list_for_each (pos, &ksocknal_data.ksnd_socklist) {
+                        if (seen++ != target)
+                                continue;
+
+                        conn = list_entry(pos, ksock_conn_t, ksnc_list);
+                        /* hold the conn while it is used unlocked */
+                        atomic_inc (&conn->ksnc_refcount);
+                        break;
+                }
+
+                read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                if (conn == NULL)
+                        break;
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+                target++;
+        }
+
+        return (0);
+}
+
+/* Look up the connection whose peer is 'nid' on the global socket list.
+ * On success the connection's refcount has been incremented on behalf of
+ * the caller; returns NULL if there is no match. */
+ksock_conn_t *
+ksocknal_get_conn (ptl_nid_t nid)
+{
+        ksock_conn_t     *found = NULL;
+        struct list_head *pos;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        list_for_each(pos, &ksocknal_data.ksnd_socklist) {
+                ksock_conn_t *candidate = list_entry(pos, ksock_conn_t,
+                                                     ksnc_list);
+
+                if (candidate->ksnc_peernid != nid)
+                        continue;
+
+                /* take the caller's reference while still under the lock */
+                atomic_inc (&candidate->ksnc_refcount);
+                found = candidate;
+                break;
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        if (found != NULL) {
+                CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                       found, nid, atomic_read (&found->ksnc_refcount));
+        } else {
+                CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n",
+                       nid);
+        }
+
+        PROF_FINISH(conn_list_walk);
+        return (found);
+}
+
+/* Final teardown of a connection: drop the reference on the socket's
+ * struct file, free the connection descriptor and release the module use
+ * count.  'conn' must not be touched once PORTAL_FREE() has run.
+ * Called directly from _ksocknal_put_conn() when not in interrupt context. */
+void
+ksocknal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+/* Called by ksocknal_put_conn() when the last reference to 'conn' has
+ * gone.  Outside interrupt context the connection is closed on the spot;
+ * in interrupt context it is queued for the reaper thread instead. */
+void
+_ksocknal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long irqflags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        /* sanity: nobody references it, the socket callbacks have been
+         * restored and detached, and no receive is in progress */
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready);
+        LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space);
+        LASSERT (conn->ksnc_sock->sk->user_data == NULL);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (in_interrupt()) {
+                /* defer the close: hand the conn to the reaper and wake it */
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, irqflags);
+
+                list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list);
+                wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock,
+                                        irqflags);
+                return;
+        }
+
+        ksocknal_close_conn (conn);
+}
+
+/* NAL command dispatcher (registered with kportal_nal_register() at module
+ * init).  Routes data->ioc_nal_cmd to the matching handler and returns the
+ * handler's result; unrecognised commands yield -EINVAL.  'private' is
+ * not used. */
+int
+ksocknal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD:
+                return ksocknal_add_sock(data->ioc_nid, data->ioc_fd,
+                                         data->ioc_flags);
+
+        case NAL_CMD_CLOSE_CONNECTION:
+                return ksocknal_close_sock(data->ioc_nid);
+
+        case NAL_CMD_REGISTER_MYNID:
+                return ksocknal_set_mynid (data->ioc_nid);
+
+        case NAL_CMD_PUSH_CONNECTION:
+                return ksocknal_push_sock (data->ioc_nid);
+
+        default:
+                return -EINVAL;
+        }
+}
+
+/* Release whatever ksocknal_module_init() managed to pre-allocate: the
+ * forwarding buffers (and every page attached to them), the transmit
+ * descriptors and the scheduler array.  Each is skipped if it was never
+ * allocated. */
+void
+ksocknal_free_buffers (void)
+{
+        ksock_fmb_t *fmbs = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs;
+        int          nfmbs = SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS;
+        int          m;
+        int          p;
+
+        if (fmbs != NULL) {
+                for (m = 0; m < nfmbs; m++) {
+                        ksock_fmb_t *fmb = &fmbs[m];
+
+                        /* page slots that were never filled are NULL */
+                        for (p = 0; p < fmb->fmb_npages; p++) {
+                                if (fmb->fmb_pages[p] == NULL)
+                                        continue;
+
+                                __free_page (fmb->fmb_pages[p]);
+                        }
+                }
+
+                PORTAL_FREE (ksocknal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                     SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ksocknal_data.ksnd_ltxs != NULL) {
+                PORTAL_FREE (ksocknal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS +
+                                                     SOCKNAL_NNBLK_LTXS));
+        }
+
+        if (ksocknal_data.ksnd_schedulers != NULL) {
+                PORTAL_FREE (ksocknal_data.ksnd_schedulers,
+                             sizeof (ksock_sched_t) * SOCKNAL_N_SCHED);
+        }
+}
+
+/* Tear the socket NAL down.
+ *
+ * Used both as the module exit handler and by ksocknal_module_init() to
+ * unwind a partially completed initialisation.  ksocknal_data.ksnd_init
+ * records how far initialisation got; the switch enters at that stage and
+ * falls through all earlier ones, so only what was set up is undone.
+ *
+ * NOTE(review): this function is marked __exit yet is called from the
+ * __init function; confirm that is safe for every build configuration
+ * this module supports.
+ */
+void __exit
+ksocknal_module_fini (void)
+{
+        int   i;
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ksocknal_data.ksnd_init) {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(SOCKNAL);
+                PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ksocknal_ni);
+                lib_fini(&ksocknal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ksocknal_data.ksnd_socklist));
+                LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                LASSERT (list_empty (&kss->kss_tx_conns));
+                                LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (kss->kss_nconns == 0);
+                        }
+
+                /* stop router calling me */
+                kpr_shutdown (&ksocknal_data.ksnd_router);
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ksocknal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ksocknal_data.ksnd_reaper_waitq);
+
+                /* same NULL guard as the assertion loop above: never walk
+                 * a scheduler array that was not allocated */
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                wake_up_all (&kss->kss_waitq);
+                        }
+
+                /* poll once a second until every thread has exited */
+                while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) {
+                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ksocknal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ksocknal_data.ksnd_router);
+
+                ksocknal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+
+/* Module initialisation for the socket NAL.
+ *
+ * Sets up the API/lib dispatch tables and all lists and locks in
+ * ksocknal_data, allocates the scheduler array and the transmit
+ * descriptors, initialises the Portals network interface, starts one
+ * scheduler thread per scheduler plus the reaper thread, optionally
+ * allocates forwarding buffers (only if router registration succeeds) and
+ * finally registers the NAL command interface.
+ *
+ * ksocknal_data.ksnd_init is advanced as stages complete so that
+ * ksocknal_module_fini() can unwind exactly what was done.
+ *
+ * Returns 0 on success, -ENOMEM or the failing call's error code otherwise.
+ *
+ * NOTE(review): the error paths call ksocknal_module_fini(), which is
+ * declared __exit; confirm that is safe for every build configuration.
+ */
+int __init
+ksocknal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ksocknal_api.forward  = ksocknal_api_forward;
+        ksocknal_api.shutdown = ksocknal_api_shutdown;
+        ksocknal_api.yield    = ksocknal_api_yield;
+        ksocknal_api.validate = NULL;           /* our api validate is a NOOP */
+        ksocknal_api.lock     = ksocknal_api_lock;
+        ksocknal_api.unlock   = ksocknal_api_unlock;
+        ksocknal_api.nal_data = &ksocknal_data;
+
+        ksocknal_lib.nal_data = &ksocknal_data;
+
+        memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist);
+        rwlock_init(&ksocknal_data.ksnd_socklist_lock);
+
+        ksocknal_data.ksnd_nal_cb = &ksocknal_lib;
+        spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
+        INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+        /* every byte of the irq table starts out as "unassigned" */
+        memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED,
+                sizeof (ksocknal_data.ksnd_irq_info));
+
+        /* flag lists/ptrs/locks initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+
+        /* NB nothing has been allocated yet, so this failure path returns
+         * without calling ksocknal_module_fini() */
+        PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
+                     sizeof(ksock_sched_t) * SOCKNAL_N_SCHED);
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                RETURN(-ENOMEM);
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+
+                spin_lock_init (&kss->kss_lock);
+                INIT_LIST_HEAD (&kss->kss_rx_conns);
+                INIT_LIST_HEAD (&kss->kss_tx_conns);
+#if SOCKNAL_ZC
+                INIT_LIST_HEAD (&kss->kss_zctxdone_list);
+#endif
+                init_waitqueue_head (&kss->kss_waitq);
+        }
+
+        /* NOTE(review): this reports descriptor sizes at CERROR level even
+         * though it is not an error -- confirm that is intended */
+        CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t),
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_ltxs,
+                     sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS));
+        if (ksocknal_data.ksnd_ltxs == NULL) {
+                ksocknal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ksocknal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        /* the first SOCKNAL_NLTXS descriptors go on the normal idle list,
+         * the remainder on the list reserved for callers that can't block */
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ksocknal_data.ksnd_idle_ltx_list :
+                                &ksocknal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ksocknal_ni, ~0);
+
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called
+
+        /* one scheduler thread per scheduler state block */
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                rc = ksocknal_thread_start (ksocknal_scheduler,
+                                            &ksocknal_data.ksnd_schedulers[i]);
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
+                               i, rc);
+                        ksocknal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ksocknal_thread_start (ksocknal_reaper, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+
+        /* failing to register with the router is not fatal: the NAL just
+         * comes up without forwarding buffers */
+        rc = kpr_register(&ksocknal_data.ksnd_router,
+                          &ksocknal_router_interface);
+        if (rc != 0) {
+                CDEBUG(D_NET, "Can't initialise routing interface "
+                       "(rc = %d): not routing\n", rc);
+        } else {
+                /* Only allocate forwarding buffers if I'm on a gateway */
+
+                PORTAL_ALLOC(ksocknal_data.ksnd_fmbs,
+                             sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                    SOCKNAL_LARGE_FWD_NMSGS));
+                if (ksocknal_data.ksnd_fmbs == NULL) {
+                        ksocknal_module_fini ();
+                        RETURN(-ENOMEM);
+                }
+
+                /* NULL out buffer pointers etc */
+                memset(ksocknal_data.ksnd_fmbs, 0,
+                       sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                              SOCKNAL_LARGE_FWD_NMSGS));
+
+                /* the first SOCKNAL_SMALL_FWD_NMSGS buffers belong to the
+                 * small pool, the rest to the large pool */
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
+                                 SOCKNAL_LARGE_FWD_NMSGS); i++) {
+                        ksock_fmb_t *fmb =
+                                &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i];
+
+                        if (i < SOCKNAL_SMALL_FWD_NMSGS) {
+                                fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp;
+                        } else {
+                                fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp;
+                        }
+
+                        LASSERT (fmb->fmb_npages > 0);
+                        for (j = 0; j < fmb->fmb_npages; j++) {
+                                fmb->fmb_pages[j] = alloc_page (GFP_KERNEL);
+
+                                /* pages already attached are released by
+                                 * ksocknal_free_buffers() via fini */
+                                if (fmb->fmb_pages[j] == NULL) {
+                                        ksocknal_module_fini ();
+                                        return (-ENOMEM);
+                                }
+
+                                LASSERT(page_address (fmb->fmb_pages[j]) !=
+                                        NULL);
+                        }
+
+                        list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+                }
+        }
+
+        rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                ksocknal_module_fini ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
+               "mem %d)\n",
+               kpr_routing (&ksocknal_data.ksnd_router) ?
+               "enabled" : "disabled", pkmem);
+
+        return (0);
+}
+
+/* Module metadata, load/unload entry points, and the one symbol
+ * (ksocknal_ni) exported from this module. */
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ksocknal_module_init);
+module_exit(ksocknal_module_fini);
+
+EXPORT_SYMBOL (ksocknal_ni);
diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h
new file mode 100644 (file)
index 0000000..46ee3b7
--- /dev/null
@@ -0,0 +1,293 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+/* NB expands to a function call, so it is re-evaluated at every use
+ * (allocation size, loops, free size).
+ * NOTE(review): confirm the value cannot change while the module is
+ * loaded, otherwise those uses could disagree. */
+#define SOCKNAL_N_SCHED num_online_cpus()       /* # socknal schedulers */
+
+#if PTL_LARGE_MTU
+# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10)      /* biggest payload I can forward */
+#else
+# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)       /* biggest payload I can forward */
+#endif
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+/* transmit low-water mark for socket 'sk': 8/10 of its send buffer size */
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
+
+/* Two of these live in ksock_nal_data_t: ksnd_small_fmp and ksnd_large_fmp.
+ * Idle buffers (ksock_fmb_t) are linked onto fmp_idle_fmbs through their
+ * fmb_list member. */
+typedef struct                                  /* pool of forwarding buffers */
+{
+        spinlock_t        fmp_lock;             /* serialise */
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+
+/* An array of SOCKNAL_N_SCHED of these is allocated at module init
+ * (ksocknal_data.ksnd_schedulers); each element is handed to one
+ * ksocknal_scheduler thread as its argument. */
+typedef struct                                  /* per scheduler state */
+{
+        spinlock_t        kss_lock;             /* serialise */
+        struct list_head  kss_rx_conns;         /* conn waiting to be read */
+        struct list_head  kss_tx_conns;         /* conn waiting to be written */
+#if SOCKNAL_ZC
+        struct list_head  kss_zctxdone_list;    /* completed ZC transmits */
+#endif
+        wait_queue_head_t kss_waitq;            /* where scheduler sleeps */
+        int               kss_nconns;           /* # connections assigned to this scheduler */
+} ksock_sched_t;
+
+/* Global state of the socket NAL; the single instance is ksocknal_data.
+ * It is zeroed and its lists/locks initialised by ksocknal_module_init(). */
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+        ptl_nid_t         ksnd_mynid;           /* presumably this node's NID -- TODO confirm */
+        nal_cb_t         *ksnd_nal_cb;          /* set to &ksocknal_lib at init */
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        ksock_sched_t    *ksnd_schedulers;      /* scheduler state */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        spinlock_t        ksnd_idle_ltx_lock;   /* serialise ltx alloc/free */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        unsigned char     ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup; each entry starts as SOCKNAL_IRQ_UNASSIGNED */
+} ksock_nal_data_t;
+
+/* Values of ksock_nal_data_t::ksnd_init, in the order initialisation
+ * reaches them; module cleanup enters at the recorded stage and falls
+ * through every earlier one. */
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+/* Encoding of the bytes in ksock_nal_data_t::ksnd_irq_info */
+#define SOCKNAL_IRQ_BOUND       0x80            /* flag we _did_ bind already */
+#define SOCKNAL_IRQ_SCHED_MASK 0x7f            /* we assume < 127 CPUs */
+#define SOCKNAL_IRQ_UNASSIGNED  0xff            /* flag unassigned */
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments and 0 or more ptl_kiov_t fragments.  Forwarded
+ * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0
+ * ptl_kiov_t fragments.  Messages from an MD with PTL_MD_KIOV set, have 1
+ * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t
+ * fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, if the message
+ * requires forwarding or will be received into mapped memory, up to
+ * PTL_MD_MAX_IOV struct iovec fragments describe the target memory.
+ * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
+ */
+
+/* NB module init asserts that this structure fits inside a
+ * kprfd_scratch_t, so its size must not grow beyond that. */
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;        /* queue on conn for transmission etc */
+        char                    tx_isfwd;       /* forwarding / sourced here */
+        int                     tx_nob;         /* # packet bytes */
+        int                     tx_niov;        /* # packet iovec frags */
+        struct iovec           *tx_iov;         /* packet iovec frags */
+        int                     tx_nkiov;       /* # packet page frags */
+        ptl_kiov_t             *tx_kiov;        /* packet page frags */
+#if SOCKNAL_ZC        
+        ksock_sched_t          *tx_sched;       /* who to wake on callback */
+        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
+#endif
+} ksock_tx_t;
+
+/* NB ksock_tx_t::tx_zccd only exists when SOCKNAL_ZC is set, so this macro
+ * can only be expanded in SOCKNAL_ZC builds */
+#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the tx frag descriptors: hdr is always 1 iovec
+ * and payload is up to PTL_MD_MAX_IOV frags of either type (the two
+ * arrays share storage through the union). */
+typedef struct
+{
+        struct iovec            hdr;
+        union {
+                struct iovec    iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+        }                       payload;
+} ksock_txiovspace_t;
+
+/* All of these are pre-allocated in one array at module init
+ * (ksocknal_data.ksnd_ltxs); ltx_idle is set there to whichever idle list
+ * the descriptor belongs to, and the descriptor is queued on that list
+ * through ltx_tx.tx_list. */
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        ksock_txiovspace_t      ltx_iov_space;  /* where to stash frag descriptors */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch.
+ * NB 'ptr' is parenthesised so that the cast applies to the whole argument
+ * expression and not just to its first operand. */
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry ((kprfd_scratch_t *)(ptr), kpr_fwd_desc_t, kprfd_scratch)
+
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+
+/* NB list_entry() is used here as convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
+
+/* Allocated in one array at module init, and only when router registration
+ * succeeds.  fmb_npages is SOCKNAL_SMALL_FWD_PAGES or
+ * SOCKNAL_LARGE_FWD_PAGES depending on the owning pool; the page and iovec
+ * arrays are always sized for the large case. */
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or PTL_MD_MAX_IOV frags of payload of either type.
+ * Embedded in ksock_conn_t as ksnc_rx_iov_space. */
+typedef union {
+        struct iovec    iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+} ksock_rxiovspace_t;
+
+/* Receive states; presumably the values held in ksock_conn_t::ksnc_rx_state
+ * -- TODO confirm against the receive path */
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+/* One per connection (socket).  Reference counted through ksnc_refcount:
+ * ksocknal_put_conn() drops a reference and hands the connection to
+ * _ksocknal_put_conn() when the count reaches zero. */
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list (also used to queue on the reaper list) */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* actual socket */
+        void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+        void               *ksnc_saved_write_space; /* socket's original write_space() callback */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        volatile int        ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # iovec frags */
+        struct iovec       *ksnc_rx_iov;        /* the iovec frags */
+        int                 ksnc_rx_nkiov;      /* # page frags */
+        ptl_kiov_t         *ksnc_rx_kiov;       /* the page frags */
+        ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        volatile int        ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+
+} ksock_conn_t;
+
+extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client);
+extern int ksocknal_close_sock(ptl_nid_t nid);
+extern int ksocknal_set_mynid(ptl_nid_t nid);
+extern int ksocknal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid);
+extern void _ksocknal_put_conn (ksock_conn_t *conn);
+extern void ksocknal_close_conn (ksock_conn_t *conn);
+
+/* Drop one reference on 'conn'.  Whoever drops the last one passes the
+ * connection on to _ksocknal_put_conn() for disposal. */
+static inline void
+ksocknal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        /* still referenced elsewhere: nothing more to do */
+        if (!atomic_dec_and_test (&conn->ksnc_refcount))
+                return;
+
+        _ksocknal_put_conn (conn);
+}
+
+extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern void ksocknal_data_ready(struct sock *sk, int n);
+extern void ksocknal_write_space(struct sock *sk);
+
+
+extern nal_cb_t         ksocknal_lib;
+extern ksock_nal_data_t ksocknal_data;
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644 (file)
index 0000000..388554d
--- /dev/null
@@ -0,0 +1,1612 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+atomic_t   ksocknal_packets_received;
+atomic_t   ksocknal_packets_launched;
+atomic_t   ksocknal_packets_being_sent;
+
+#if SOCKNAL_ZC
+int        ksocknal_do_zc = 1;
+int        ksocknal_zc_min_frag = 2048;
+#endif
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* lib 'read' callback: copy 'len' bytes from src_addr to dst_addr with a
+ * plain memcpy().  'private' is not used.  Always returns 0. */
+int
+ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        long nbytes = (long)len;        /* only for the %ld in the trace */
+
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, nbytes, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+/* lib 'write' callback: copy 'len' bytes from src_addr to dst_addr with a
+ * plain memcpy().  'private' is not used.  Always returns 0. */
+int
+ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        long nbytes = (long)len;        /* only for the %ld in the trace */
+
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, nbytes, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+/* lib 'callback' hook: invoke the event queue's handler on 'ev' if the
+ * queue has one.  'private' is not used.  Always returns 0. */
+int
+ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                         ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback == NULL)
+                return 0;               /* no handler registered */
+
+        eq->event_callback(ev);
+        return 0;
+}
+
+/* lib 'malloc' callback: allocate 'len' bytes with PORTAL_ALLOC and hand
+ * them back zero-filled.  Returns NULL if the allocation failed. */
+void *
+ksocknal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *mem;
+
+        PORTAL_ALLOC(mem, len);
+        if (mem == NULL)
+                return (NULL);
+
+        memset(mem, 0, len);
+        return (mem);
+}
+
+/* lib 'free' callback: release a 'len' byte buffer via PORTAL_FREE.
+ * 'nal' is not used. */
+void
+ksocknal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+        return;
+}
+
+/* lib 'printf' callback: format the message into a 256 byte stack buffer
+ * (longer output is truncated) and emit it through CDEBUG at D_NET. */
+void
+ksocknal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        va_start (args, fmt);
+        vsnprintf (text, sizeof (text), fmt, args);
+        va_end (args);
+
+        /* belt and braces: force termination whatever vsnprintf did */
+        text[sizeof (text) - 1] = 0;
+
+        CDEBUG (D_NET, "%s", text);
+}
+
+/* lib 'cli' callback: take the NAL callback spinlock.  This is a plain
+ * spin_lock(); '*flags' is neither read nor written. */
+void
+ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *nal_data;
+
+        nal_data = nal->nal_data;
+        spin_lock(&nal_data->ksnd_nal_cb_lock);
+}
+
+/* lib 'sti' callback: drop the NAL callback spinlock.  This is a plain
+ * spin_unlock(); '*flags' is neither read nor written. */
+void
+ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *nal_data = nal->nal_data;
+
+        spin_unlock(&nal_data->ksnd_nal_cb_lock);
+}
+
+/* lib 'dist' callback: report the distance to 'nid' in '*dist': 0 when
+ * 'nid' is this NAL's own nid, 1 for anything else.  Always returns 0. */
+int
+ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ksocknal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return 0;
+}
+
+/* Take a local transmit descriptor off one of the idle lists.
+ *
+ * The normal idle list is always tried first.  If it is empty:
+ *   - may_block != 0: sleep on ksnd_idle_ltx_waitq until the normal list
+ *     is non-empty, then retry;
+ *   - may_block == 0: fall back to the non-blocking idle list.
+ *
+ * Returns the descriptor (already unlinked from its idle list), or NULL,
+ * which can only happen when !may_block and both lists were empty. */
+ksock_ltx_t *
+ksocknal_get_ltx (int may_block)
+{
+        unsigned long    flags;         /* spin_lock_irqsave() needs unsigned long */
+        ksock_ltx_t     *ltx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) {
+                        ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next,
+                                         ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) {
+                                ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next,
+                                                 ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                /* must not sleep holding the lock; the list is re-checked
+                 * under the lock on the next pass */
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock,
+                                       flags);
+
+                wait_event (ksocknal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ksocknal_data.ksnd_idle_ltx_list));
+        }
+
+        /* every 'break' above leaves the loop with the lock still held */
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        return (ltx);
+}
+
+#if SOCKNAL_ZC
+/* Translate a kernel virtual address into its struct page.
+ *
+ * Addresses in the vmalloc range (and, with CONFIG_HIGHMEM, in the pkmap
+ * range) are resolved with vmalloc_to_page(); everything else goes through
+ * virt_to_page().  Returns NULL if no page was found or the page fails
+ * VALID_PAGE(). */
+struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *pg;
+
+        if (vaddr >= VMALLOC_START && vaddr < VMALLOC_END) {
+                pg = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        } else if (vaddr >= PKMAP_BASE &&
+                   vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* in 2.4 vmalloc_to_page() just walks the page tables */
+                pg = vmalloc_to_page ((void *)vaddr);
+#endif
+        } else {
+                pg = virt_to_page (vaddr);
+        }
+
+        if (pg != NULL && VALID_PAGE (pg))
+                return (pg);
+
+        return (NULL);
+}
+#endif
+
+/* Try to send (part of) the first mapped fragment of 'tx' on 'sock'.
+ *
+ * 'more' is a hint that further data follows; it is also forced on when
+ * more iov frags remain, or when only part of the frag is being offered.
+ *
+ * With SOCKNAL_ZC, and a route that supports scatter/gather + checksum
+ * offload, the part of the frag up to the next page boundary is sent
+ * zero-copy; otherwise the whole frag is sent with the protocol's
+ * sendmsg().  On any progress tx->tx_nob and the frag are advanced.
+ *
+ * Returns:
+ *   1        the socket took everything it was offered (the frag may
+ *            still have bytes left if only one page of it was offered);
+ *            the caller may call again straight away
+ *   -EAGAIN  the socket took less than it was offered
+ *   <= 0     otherwise: whatever the socket operation returned
+ */
+int
+ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        struct iovec  *iov = tx->tx_iov;
+        int            fragsize = iov->iov_len;
+        unsigned long  vaddr = (unsigned long)iov->iov_base;
+        int            tried = fragsize;        /* # bytes offered to the socket */
+#if SOCKNAL_ZC
+        int            offset = vaddr & (PAGE_SIZE - 1);
+        int            zcsize = MIN (fragsize, PAGE_SIZE - offset);
+        struct page   *page;
+#endif
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (tx->tx_niov > 0);
+        more |= (tx->tx_niov > 1);
+
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            zcsize >= ksocknal_zc_min_frag &&
+            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
+
+                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
+                       (void *)vaddr, page, page_address(page), offset, zcsize);
+
+                /* only the bytes up to the page boundary go this time */
+                more |= (zcsize < fragsize);
+                tried = zcsize;
+
+                rc = tcp_sendpage_zccd(sock, page, offset, zcsize,
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* NB don't pass tx's iov; sendmsg may or may not update it */
+                struct iovec fragiov = { .iov_base = (void *)vaddr,
+                                         .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t oldmm = get_fs();
+
+                /* the buffer is a kernel address */
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+        }
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag: step over what did go... */
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len  = fragsize - rc;
+                /* ...but only report a short send if the socket refused
+                 * some of what it was offered.  A complete zero-copy send
+                 * of the leading page is progress, not back-pressure;
+                 * returning -EAGAIN for it would make the caller wait for
+                 * a write-space event that need not come. */
+                return ((rc == tried) ? 1 : -EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_iov++;
+        tx->tx_niov--;
+        return (1);
+}
+
+int
+ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        ptl_kiov_t    *kiov = tx->tx_kiov;
+        int            fragsize = kiov->kiov_len;
+        struct page   *page = kiov->kiov_page;
+        int            offset = kiov->kiov_offset;
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        LASSERT (tx->tx_nkiov > 0);
+        more |= (tx->tx_nkiov > 1);
+
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            fragsize >= ksocknal_zc_min_frag) {
+
+                CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                               page, offset, fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, fragsize,
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                char *addr = ((char *)kmap (page)) + offset;
+                struct iovec fragiov = {.iov_base = addr,
+                                        .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t  oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+                kunmap (page);
+        }
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len    = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_kiov++;
+        tx->tx_nkiov--;
+        return (1);
+}
+
+/* Push as much of 'tx' as the socket will take, one fragment at a time:
+ * mapped (iov) frags first, then page (kiov) frags.
+ *
+ * Returns 0 if everything was sent, or if the socket stopped accepting
+ * data (-EAGAIN, or any failure after some progress was made in this
+ * call); the caller tells the two apart by looking at tx->tx_nob.
+ * Otherwise returns the fragment sender's result (<= 0) unchanged.
+ * Must not be called from interrupt context. */
+int
+ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        int    progressed = 0;
+        int    rc;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        while (1) {
+                if (tx->tx_niov == 0)
+                        rc = ksocknal_send_kiov (sock, tx, more);
+                else
+                        rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0);
+
+                /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */
+                if (rc <= 0) {                  /* error or partial send */
+                        if (progressed || rc == -EAGAIN)
+                                RETURN (0);
+                        RETURN (rc);
+                }
+
+                if (tx->tx_nob == 0)            /* sent everything */
+                        RETURN (0);
+
+                progressed = 1;
+        }
+}
+
+/* Non-blocking receive into the first mapped fragment of conn's rx
+ * descriptor.  On any progress ksnc_rx_nob_wanted/ksnc_rx_nob_left and
+ * the frag are advanced.
+ *
+ * Returns 1 if the frag was filled, -EAGAIN if only part of it was,
+ * otherwise (<= 0) whatever sock_recvmsg() returned. */
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int           fragsize = iov->iov_len;
+        unsigned long base = (unsigned long)iov->iov_base;
+        /* private copy of the frag: the socket layer may scribble on it */
+        struct iovec  scratch = { .iov_base = (void *)base,
+                                  .iov_len  = fragsize };
+        struct msghdr msg = {
+                .msg_iov    = &scratch,
+                .msg_iovlen = 1,
+        };
+        mm_segment_t  saved_fs = get_fs();
+        int           nread;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+
+        set_fs (KERNEL_DS);
+        /* NB the flags argument is just a boolean */
+        nread = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        set_fs (saved_fs);
+
+        if (nread <= 0)
+                return (nread);
+
+        conn->ksnc_rx_nob_wanted -= nread;
+        conn->ksnc_rx_nob_left   -= nread;
+
+        if (nread < fragsize) {
+                /* short read: remember where to pick up next time */
+                iov->iov_base = (void *)(base + nread);
+                iov->iov_len  = fragsize - nread;
+                return (-EAGAIN);
+        }
+
+        LASSERT (nread == fragsize);
+        conn->ksnc_rx_iov++;
+        conn->ksnc_rx_niov--;
+        return (1);
+}
+
+/* Non-blocking receive into the first page fragment of conn's rx
+ * descriptor.  The page is kmap()ed for the duration of the receive.
+ * On any progress ksnc_rx_nob_wanted/ksnc_rx_nob_left and the frag are
+ * advanced.
+ *
+ * Returns 1 if the frag was filled, -EAGAIN if only part of it was,
+ * otherwise (<= 0) whatever sock_recvmsg() returned. */
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+        ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        struct page  *page = kiov->kiov_page;
+        int           offset = kiov->kiov_offset;
+        int           fragsize = kiov->kiov_len;
+        unsigned long base = ((unsigned long)kmap (page)) + offset;
+        struct iovec  scratch = { .iov_base = (void *)base,
+                                  .iov_len  = fragsize };
+        struct msghdr msg = {
+                .msg_iov    = &scratch,
+                .msg_iovlen = 1,
+        };
+        mm_segment_t  saved_fs = get_fs();
+        int           nread;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+
+        set_fs (KERNEL_DS);
+        /* NB the flags argument is just a boolean */
+        nread = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        set_fs (saved_fs);
+        kunmap (page);
+
+        if (nread <= 0)
+                return (nread);
+
+        conn->ksnc_rx_nob_wanted -= nread;
+        conn->ksnc_rx_nob_left   -= nread;
+
+        if (nread < fragsize) {
+                /* short read: remember where to pick up next time */
+                kiov->kiov_offset = offset + nread;
+                kiov->kiov_len    = fragsize - nread;
+                return (-EAGAIN);
+        }
+
+        LASSERT (nread == fragsize);
+        conn->ksnc_rx_kiov++;
+        conn->ksnc_rx_nkiov--;
+        return (1);
+}
+
+/* Receive as much of what conn currently wants (ksnc_rx_nob_wanted) as
+ * the socket will deliver without blocking, one fragment at a time:
+ * mapped (iov) frags first, then page (kiov) frags.
+ *
+ * CAVEAT EMPTOR: unlike ksocknal_sendmsg() this returns
+ *   > 0  success: everything wanted has arrived, or the socket ran dry
+ *        (-EAGAIN), or something went wrong only after this call had
+ *        already made progress; the caller tells these apart by looking
+ *        at conn->ksnc_rx_nob_wanted
+ *   <= 0 error (0 == EOF) before anything was received by this call
+ *
+ * Must not be called from interrupt context. */
+int
+ksocknal_recvmsg (ksock_conn_t *conn) 
+{
+        int    rc;
+        int    got_some = 0;
+        ENTRY;
+        
+        LASSERT (!in_interrupt ());
+
+        for (;;) {
+                LASSERT (conn->ksnc_rx_nob_wanted > 0);
+                
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                /* CAVEAT EMPTOR: we return...
+                 * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */
+
+                if (rc <= 0)                    /* error/EOF or partial receive */
+                        RETURN ((got_some || rc == -EAGAIN) ? 1 : rc);
+                
+                if (conn->ksnc_rx_nob_wanted == 0)
+                        RETURN (1);
+
+                /* A whole frag arrived.  Record the progress so the test
+                 * above can see it, just as ksocknal_sendmsg() does with
+                 * 'sent_some'; this used to store 0, which left the
+                 * 'got_some' test permanently false.
+                 * NOTE(review): an error/EOF that follows progress is now
+                 * reported by the NEXT call -- confirm the caller retries
+                 * after a short read. */
+                got_some = 1;
+        }
+}
+
+#if SOCKNAL_ZC
+/* Zero-copy completion callback for 'zcd'.  The owning tx is queued on
+ * its scheduler's kss_zctxdone_list, and anyone sleeping on the
+ * scheduler's waitq is woken, so that the tx gets cleaned up later. */
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t    *tx = KSOCK_ZCCD_2_TX(zcd);
+        ksock_sched_t *sched;
+        unsigned long  flags;
+        ENTRY;
+
+        sched = tx->tx_sched;
+
+        /* Schedule tx for cleanup (can't do it now due to lock conflicts) */
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        EXIT;
+}
+#endif
+
+/* Complete a transmit descriptor.
+ *
+ * Decrements ksocknal_packets_being_sent, then:
+ *   - forwarded packet: report completion (status 0) to the router with
+ *     kpr_fwd_done();
+ *   - local send: complete the message with lib_finalize() and put the
+ *     descriptor back on the idle list it came from (ltx_idle), waking
+ *     anyone sleeping for a normal descriptor. */
+void
+ksocknal_tx_done (ksock_tx_t *tx)
+{
+        unsigned long  flags;           /* spin_lock_irqsave() needs unsigned long */
+        ksock_ltx_t   *ltx;
+        ENTRY;
+
+        atomic_dec (&ksocknal_packets_being_sent);
+
+        if (tx->tx_isfwd) {             /* was a forwarded packet? */
+                kpr_fwd_done (&ksocknal_data.ksnd_router,
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                EXIT;
+                return;
+        }
+
+        /* local send */
+        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+        /* normal tx desc => wakeup anyone blocking for one */
+        if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list &&
+            waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq))
+                wake_up (&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+        EXIT;
+}
+
+/* Progress the first connection queued on sched->kss_tx_conns by trying to
+ * send the packet at the head of its tx queue.
+ *
+ * Called with sched->kss_lock held (state saved in *irq_flags); the lock
+ * is dropped around the actual send and is held again on return.
+ *
+ * A packet that has nothing left to send is completed (directly, or via
+ * the zero-copy callback once its zccd count drops); a partly sent packet
+ * goes back on the head of the conn's tx queue.  The conn is then either
+ * re-queued on the scheduler or descheduled, dropping the scheduler's
+ * reference on it. */
+void
+ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_tx_t *tx;
+        int         rc;
+
+        LASSERT (!list_empty (&sched->kss_tx_conns));
+        conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
+        list_del (&conn->ksnc_tx_list);
+
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+        tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        /* assume transmit will complete now, so dequeue while I've got lock */
+        list_del (&tx->tx_list);
+
+        /* send without the scheduler lock; ksocknal_sendmsg() asserts that
+         * it is not called from interrupt context */
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to write */
+
+        rc = ksocknal_sendmsg (conn->ksnc_sock, tx, 
+                               !list_empty (&conn->ksnc_tx_queue)); /* more to come? */
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc != 0) {
+#warning FIXME: handle socket errors properly
+                CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                /* kid on for now the whole packet went.
+                 * NB when we handle the error better, we'll still need to
+                 * block for zccd completion.
+                 */
+                tx->tx_nob = 0;
+        }
+
+        if (tx->tx_nob == 0)                    /* nothing left to send */
+        {
+                /* everything went; assume more can go, so prevent write_space locking */
+                conn->ksnc_tx_ready = 1;
+
+                ksocknal_put_conn (conn);       /* release packet's ref */
+                /* balanced by the atomic_dec() in ksocknal_tx_done() */
+                atomic_inc (&ksocknal_packets_being_sent);
+#if SOCKNAL_ZC
+                if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
+                        /* zccd skbufs are still in-flight.  Release my
+                         * initial ref on zccd, so callback can occur */
+                        zccd_put (&tx->tx_zccd);
+                } else
+#endif
+                        ksocknal_tx_done (tx);
+
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+        } else {
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        /* scheduler lock is held again from here on */
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                                 /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+}
+
+void
+ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long  flags;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete portals header.
+         */
+        LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t));
+        
+        CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n",
+                ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, 
+                tx->tx_niov, tx->tx_nkiov);
+
+#if SOCKNAL_ZC
+        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
+        /* NB this sets 1 ref on zccd, so the callback can only occur
+         * after I've released this ref */
+        tx->tx_sched = sched;
+#endif
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled) {          /* not scheduled to send */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&sched->kss_waitq))
+                        wake_up (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        atomic_inc (&ksocknal_packets_launched);
+}
+
+/* Find the connection to use for sending to 'nid': the conn to 'nid'
+ * itself if ksocknal_get_conn() finds one, otherwise the conn to the
+ * gateway the router names for it.
+ *
+ * Returns the conn as obtained from ksocknal_get_conn(), or NULL (after
+ * logging an error) if the router lookup fails or no conn to the gateway
+ * is found. */
+ksock_conn_t *
+ksocknal_send_target (ptl_nid_t nid) 
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     gatewaynid;
+        int           rc;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn != NULL)
+                return (conn);
+
+        /* It's not a peer; try to find a gateway */
+        rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid);
+        if (rc != 0) {
+                CERROR("Can't route to "LPX64": router error %d\n",
+                       nid, rc);
+                return (NULL);
+        }
+
+        conn = ksocknal_get_conn (gatewaynid);
+        if (conn == NULL)
+                CERROR ("Can't route to "LPX64": gateway "LPX64
+                        " is not a peer\n", nid, gatewaynid);
+
+        return (conn);
+}
+
+/* Get a local transmit descriptor and prime it to send just the portals
+ * header '*hdr': the header is copied into the descriptor and becomes its
+ * single mapped fragment; 'private' and 'cookie' are stashed in the
+ * descriptor.
+ *
+ * Blocks for a descriptor unless 'type' is an ACK or REPLY, or we are in
+ * interrupt context.  Returns NULL (after logging) if none was available.
+ * 'nal' is not used. */
+ksock_ltx_t *
+ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                    ptl_hdr_t *hdr, int type)
+{
+        ksock_ltx_t  *ltx;
+        ksock_tx_t   *tx;
+        int           may_block;
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        may_block = (type != PTL_MSG_ACK &&
+                     type != PTL_MSG_REPLY &&
+                     !in_interrupt ());
+
+        ltx = ksocknal_get_ltx (may_block);
+        if (ltx == NULL) {
+                CERROR ("Can't allocate tx desc\n");
+                return (NULL);
+        }
+
+        /* Init local send packet (storage for hdr, finalize() args) */
+        ltx->ltx_hdr     = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie  = cookie;
+
+        /* Init common ltx_tx */
+        tx = &ltx->ltx_tx;
+        tx->tx_isfwd = 0;
+        tx->tx_nob   = sizeof (*hdr);
+
+        /* We always have 1 mapped frag for the header */
+        tx->tx_iov  = &ltx->ltx_iov_space.hdr;
+        tx->tx_niov = 1;
+        tx->tx_iov[0].iov_base = &ltx->ltx_hdr;
+        tx->tx_iov[0].iov_len  = sizeof (ltx->ltx_hdr);
+
+        /* ...and no page frags yet */
+        tx->tx_nkiov = 0;
+        tx->tx_kiov  = NULL;
+
+        return (ltx);
+}
+
+/* lib 'send' callback: send '*hdr' followed by a payload described by
+ * 'payload_niov' mapped fragments totalling 'payload_len' bytes to 'nid'.
+ *
+ * Returns 0 once the packet has been queued, or -1 if no connection or
+ * transmit descriptor could be obtained; in the failure case the message
+ * is completed here with lib_finalize(). */
+int
+ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        ksock_tx_t   *tx;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it 
+         *
+         * Also, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL)
+                goto failed;
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                goto failed;
+        }
+
+        /* append the payload_iovs to the one pointing at the header */
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        tx = &ltx->ltx_tx;
+        memcpy (&tx->tx_iov[1], payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        tx->tx_niov = 1 + payload_niov;
+        tx->tx_nob  = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, tx);
+        return (0);
+
+ failed:
+        lib_finalize (&ksocknal_lib, private, cookie);
+        return (-1);
+}
+
+/* lib 'send_pages' callback: send '*hdr' followed by a payload described
+ * by 'payload_niov' page fragments totalling 'payload_len' bytes to 'nid'.
+ *
+ * Returns 0 once the packet has been queued, or -1 if no connection or
+ * transmit descriptor could be obtained.  As in ksocknal_send(), a
+ * message that cannot be sent is still completed with lib_finalize(),
+ * since that function documents that the return code is ignored.
+ * NOTE(review): this assumes the lib treats send_pages failures the same
+ * way as send failures -- confirm against the caller. */
+int
+ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+        
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                /* can't send, but the message must still complete */
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                /* can't send, but the message must still complete */
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        
+        /* the header stays in the single mapped frag; the payload page
+         * frags are copied into the descriptor's own kiov space */
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov, 
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/* Router callback: forward the packet described by 'fwd'.
+ *
+ * The packet goes to fwd's gateway nid, unless that is this node's own
+ * nid, in which case it goes straight to the target nid.  If there is no
+ * connection to the chosen nid the forward is completed with
+ * -EHOSTUNREACH; otherwise a tx is built in fwd's scratch space and
+ * queued on the connection.  'arg' is not used. */
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_tx_t   *tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+        ksock_conn_t *conn;
+        ptl_nid_t     nid;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (fwd->kprfd_gateway_nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+        else
+                nid = fwd->kprfd_gateway_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                   /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+
+        /* mapped frags come straight from the forwarding descriptor */
+        tx->tx_iov   = fwd->kprfd_iov;
+        tx->tx_niov  = fwd->kprfd_niov;
+
+        /* no page frags */
+        tx->tx_kiov  = NULL;
+        tx->tx_nkiov = 0;
+
+        ksocknal_launch_packet (conn, tx);
+}
+
+/* Start a kernel thread running fn(arg).  On success the thread is
+ * counted in ksnd_nthreads and 0 is returned; on failure the (negative)
+ * result of kernel_thread() is returned. */
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid >= 0) {
+                atomic_inc (&ksocknal_data.ksnd_nthreads);
+                return (0);
+        }
+
+        return ((int)pid);
+}
+
+/* Drop one from the ksnd_nthreads count that ksocknal_thread_start()
+ * increments. */
+void
+ksocknal_thread_fini (void)
+{
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+        return;
+}
+
+/* Completion callback for a forwarded message buffer; 'arg' is the
+ * ksock_fmb_t and 'error' the forwarding status (0 == OK).
+ *
+ * Logs the outcome, returns the buffer to its pool's idle list and, if a
+ * connection is blocked waiting for a buffer from that pool, takes the
+ * first one off the blocked list, moves it back to SOCKNAL_RX_GET_FMB
+ * and queues it on its scheduler's rx list. */
+void
+ksocknal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
+        /* the packet header sits at the start of the buffer's first page */
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn = NULL;
+        ksock_sched_t     *sched;
+        unsigned long      flags;       /* spin_lock_irqsave() needs unsigned long */
+
+        if (error != 0)
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),
+                       error);
+        else
+                CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
+                        NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+
+        spin_lock_irqsave (&fmp->fmp_lock, flags);
+
+        list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
+
+        if (!list_empty (&fmp->fmp_blocked_conns)) {
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next,
+                                   ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+        }
+
+        spin_unlock_irqrestore (&fmp->fmp_lock, flags);
+
+        if (conn == NULL)               /* nobody was waiting */
+                return;
+
+        CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+
+        sched = conn->ksnc_scheduler;
+
+        /* the pool lock has been dropped; 'flags' is reused for the
+         * scheduler lock */
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+}
+
+/* Get a forwarding buffer for the packet 'conn' is receiving.
+ *
+ * The small pool is used when header + remaining payload fit in
+ * SOCKNAL_SMALL_FWD_PAGES pages, the large pool otherwise.
+ *
+ * Returns an idle buffer taken off the pool, or NULL if the pool has
+ * none; in that case conn is moved to SOCKNAL_RX_FMB_SLEEP and queued on
+ * the pool's blocked list until a buffer is returned to the pool. */
+ksock_fmb_t *
+ksocknal_get_idle_fmb (ksock_conn_t *conn)
+{
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        unsigned long     flags;        /* spin_lock_irqsave() needs unsigned long */
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (ksocknal_data.ksnd_fmbs != NULL);
+
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ksocknal_data.ksnd_small_fmp;
+        else
+                pool = &ksocknal_data.ksnd_large_fmp;
+
+        spin_lock_irqsave (&pool->fmp_lock, flags);
+
+        if (!list_empty (&pool->fmp_idle_fmbs)) {
+                fmb = list_entry(pool->fmp_idle_fmbs.next,
+                                 ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                spin_unlock_irqrestore (&pool->fmp_lock, flags);
+
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+
+        spin_unlock_irqrestore (&pool->fmp_lock, flags);
+        return (NULL);
+}
+
+
+/* Prepare forwarding buffer 'fmb' for the packet whose header 'conn' has
+ * just received.
+ *
+ * The header is copied into the first page of the buffer.  If the packet
+ * has no payload it is handed to the router straight away, 'conn' is set
+ * up for the next packet, and 1 is returned.  Otherwise the buffer's iov
+ * is built to span the whole packet, 'conn' is set up to read the payload
+ * into it (state SOCKNAL_RX_BODY_FWD), and 0 is returned.
+ */
+int
+ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int niov;                               /* at least the header */
+        int nob;
+
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+
+        /* copy header */
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t));
+
+        if (payload_nob == 0) {         /* got complete packet already */
+                atomic_inc (&ksocknal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                              packet_nob, 1, fmb->fmb_iov,
+                              ksocknal_fmb_callback, fmb);
+
+                /* forward it now */
+                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        /* one iov entry per buffer page needed to hold the packet */
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE) {  /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        } else {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+
+                do {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base =
+                                page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                      packet_nob, niov, fmb->fmb_iov,
+                      ksocknal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
+                 sizeof (struct iovec));
+
+        /* first entry: same page as fmb_iov[0], offset past the header */
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        conn->ksnc_rx_iov[0].iov_base =
+                (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) +
+                         sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len =
+                fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        /* remaining entries are identical to the forwarding buffer's */
+        if (niov > 1)
+                memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1],
+                       (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
+
+/* Examine the header of a packet that is not addressed to this node and
+ * decide whether it can be forwarded.
+ *
+ * On return conn->ksnc_rx_state tells the caller what to do next:
+ *   SOCKNAL_RX_GET_FMB - packet accepted; ksnc_rx_nob_left/wanted hold the
+ *                        payload size and a forwarding buffer is needed;
+ *   anything else      - packet dropped; ksocknal_new_packet() has set the
+ *                        connection up to skip the body (if any) or to
+ *                        read the next header.
+ *
+ * NIDs in the header are converted with NTOH__u64() once, up front; the
+ * converted values are used for all messages and for the peer lookup.
+ */
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t     src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid);
+        ptl_nid_t     dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int           body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                src_nid, dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) {                 /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", src_nid, dest_nid, body_len);
+                ksocknal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) {        /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", src_nid, dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) {      /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n", src_nid,
+                        dest_nid, body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct */
+        conn2 = ksocknal_get_conn (dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n", src_nid, dest_nid);
+                ksocknal_put_conn (conn2);  /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+/* Set 'conn' up for whatever follows the current packet.
+ *
+ * nob_to_skip == 0: the stream is at a packet boundary; arm the receive
+ *                   state to read the next header and return 1.
+ * nob_to_skip != 0: arm the receive state (SOCKNAL_RX_SLOP) to read
+ *                   unwanted bytes into a shared scratch buffer and
+ *                   return 0.
+ */
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        /* scratch space for discarded bytes; contents are never looked at */
+        static char ksocknal_slop_buffer[4096];
+
+        struct iovec *rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        int           chunk;
+        int           nfrags = 0;
+        int           nob_queued = 0;
+
+        /* common to both cases: receive via the conn's own iov space,
+         * no page fragments */
+        conn->ksnc_rx_iov = rx_iov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now.  If there's more left
+         * (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+
+        do {
+                /* every fragment points at the same scratch buffer */
+                chunk = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                rx_iov[nfrags].iov_base = ksocknal_slop_buffer;
+                rx_iov[nfrags].iov_len  = chunk;
+
+                nob_to_skip -= chunk;
+                nob_queued += chunk;
+                nfrags++;
+
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 nfrags < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = nfrags;
+        conn->ksnc_rx_nob_wanted = nob_queued;
+        return (0);
+}
+
+void
+ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_fmb_t  *fmb;
+        int           rc;
+
+        /* NB: sched->ksnc_lock lock held */
+
+        LASSERT (!list_empty (&sched->kss_rx_conns));
+        conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list);
+        list_del (&conn->ksnc_rx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        CDEBUG(D_NET, "sched %p conn %p\n", sched, conn);
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* doesn't need a forwarding buffer */
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
+                goto try_read;
+
+ get_fmb:
+        fmb = ksocknal_get_idle_fmb (conn);
+        if (fmb == NULL) {      /* conn descheduled waiting for idle fmb */
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+                return;
+        }
+
+        if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to read */
+
+        rc = ksocknal_recvmsg(conn);
+
+        if (rc == 0)
+                goto out;
+        if (rc < 0) {
+#warning FIXME: handle socket errors properly
+                CERROR ("Error socknal read %p: %d\n", conn, rc);
+                goto out;
+        }
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        /* got all I wanted, assume there's more - prevent data_ready locking */
+        conn->ksnc_rx_ready = 1;
+
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_HEADER:
+                /* It's not for me */
+                if (conn->ksnc_hdr.type != PTL_MSG_HELLO &&
+                    NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) {
+                        ksocknal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state) {
+                        case SOCKNAL_RX_HEADER: /* skipped (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                LBUG ();
+                        }
+                        /* Not Reached */
+                }
+
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc */
+                lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ksocknal_packets_received);
+                /* packet is done now */
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                /* starting new packet? */
+                if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                        goto out;       /* come back later */
+                goto try_read;          /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        NTOH__u64 (conn->ksnc_hdr.dest_nid),
+                        conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ksocknal_packets_received);
+
+                /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */
+                kpr_fwd_start (&ksocknal_data.ksnd_router,
+                               (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                /* no slop in forwarded packets */
+                LASSERT (conn->ksnc_rx_nob_left == 0);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+        /* no data there to read? */
+        if (!conn->ksnc_rx_ready) {
+                /* let socket callback schedule again */
+                conn->ksnc_rx_scheduled = 0;
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                              /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+}
+
+/* cb_recv handler: record where the payload of the message being parsed
+ * should land.  'private' is the receiving connection.  The caller's iovec
+ * array is copied into the connection's own iov space; 'mlen' becomes the
+ * number of bytes wanted and 'rlen' the number of bytes left.
+ * Returns rlen.
+ */
+int
+ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
+               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *rx_conn = private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        /* take a private copy of the destination fragments */
+        rx_conn->ksnc_rx_iov = rx_conn->ksnc_rx_iov_space.iov;
+        rx_conn->ksnc_rx_niov = niov;
+        memcpy (rx_conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+
+        /* no page fragments on this path */
+        rx_conn->ksnc_rx_kiov = NULL;
+        rx_conn->ksnc_rx_nkiov = 0;
+
+        rx_conn->ksnc_rx_nob_left   = rlen;
+        rx_conn->ksnc_rx_nob_wanted = mlen;
+        rx_conn->ksnc_cookie = msg;
+
+        /* the fragments must add up to exactly what is wanted */
+        LASSERT (mlen ==
+                 lib_iov_nob (rx_conn->ksnc_rx_niov, rx_conn->ksnc_rx_iov) +
+                 lib_kiov_nob (rx_conn->ksnc_rx_nkiov, rx_conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+/* cb_recv_pages handler: page-fragment counterpart of ksocknal_recv().
+ * 'private' is the receiving connection.  The caller's kiov array is copied
+ * into the connection's own iov space; 'mlen' becomes the number of bytes
+ * wanted and 'rlen' the number of bytes left.  Returns rlen.
+ */
+int
+ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *rx_conn = private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        /* take a private copy of the destination page fragments */
+        rx_conn->ksnc_rx_kiov = rx_conn->ksnc_rx_iov_space.kiov;
+        rx_conn->ksnc_rx_nkiov = niov;
+        memcpy (rx_conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+
+        /* no virtual-address fragments on this path */
+        rx_conn->ksnc_rx_iov  = NULL;
+        rx_conn->ksnc_rx_niov = 0;
+
+        rx_conn->ksnc_rx_nob_left   = rlen;
+        rx_conn->ksnc_rx_nob_wanted = mlen;
+        rx_conn->ksnc_cookie = msg;
+
+        /* the fragments must add up to exactly what is wanted */
+        LASSERT (mlen ==
+                 lib_iov_nob (rx_conn->ksnc_rx_niov, rx_conn->ksnc_rx_iov) +
+                 lib_kiov_nob (rx_conn->ksnc_rx_nkiov, rx_conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+/* Scheduler thread body.  'arg' is this thread's ksock_sched_t.
+ *
+ * Loops until ksocknal_data.ksnd_shuttingdown is set, handling at most one
+ * rx connection, one tx connection and (with SOCKNAL_ZC) one completed
+ * zero-copy tx per iteration.  When there was nothing to do it sleeps on
+ * the scheduler's wait queue; after SOCKNAL_RESCHED busy iterations it
+ * calls our_cond_resched() instead.  Always returns 0.
+ */
+int ksocknal_scheduler (void *arg)
+{
+        ksock_sched_t     *sched = (ksock_sched_t *)arg;
+        unsigned long      flags;
+        int                rc;
+        int                nloops = 0;
+        /* index of this scheduler in the global scheduler array */
+        int                id = sched - ksocknal_data.ksnd_schedulers;
+        char               name[16];
+#if (CONFIG_SMP && CPU_AFFINITY)
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        int                cpu = cpu_logical_map(id % num_online_cpus());
+#else
+#warning "Take care of architecure specific logical APIC map"
+        int cpu = 1;    /* Have to change later. */
+#endif /* LINUX_VERSION_CODE */
+        
+        /* NOTE(review): '1 << cpu' is an int shift; presumably the mask
+         * argument is wider than int - confirm for cpu >= 31 */
+        set_cpus_allowed (current, 1 << cpu);
+        id = cpu;
+#endif /* CONFIG_SMP && CPU_AFFINITY */
+
+        snprintf (name, sizeof (name),"ksocknald[%d]", id);
+        kportal_daemonize (name);
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        /* kss_lock is held at the top of every iteration */
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&sched->kss_rx_conns)) {
+                        did_something = 1;
+                        /* drops & regains kss_lock */
+                        ksocknal_process_receive (sched, &flags);
+                }
+
+                if (!list_empty (&sched->kss_tx_conns)) {
+                        did_something = 1;
+                        /* drops and regains kss_lock */
+                        ksocknal_process_transmit (sched, &flags);
+                }
+#if SOCKNAL_ZC
+                if (!list_empty (&sched->kss_zctxdone_list)) {
+                        ksock_tx_t *tx =
+                                list_entry(sched->kss_zctxdone_list.next,
+                                           ksock_tx_t, tx_list);
+                        did_something = 1;
+
+                        list_del (&tx->tx_list);
+                        /* complete the tx without holding the lock */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        ksocknal_tx_done (tx);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+#endif
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+#if SOCKNAL_ZC
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns) ||
+                                                               !list_empty(&sched->kss_zctxdone_list));
+#else
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns));
+#endif
+                                /* an interrupted wait is treated as a bug;
+                                 * kportal_blockallsigs() was called above */
+                                LASSERT (rc == 0);
+                        } else
+                               our_cond_resched();
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+/* Socket data_ready callback.
+ *
+ * If the socket no longer has a connection attached (sk->user_data is
+ * NULL) the call is passed on to whatever data_ready handler the socket
+ * has now.  Otherwise the connection is flagged as having data to read
+ * and, unless it is already scheduled, queued on its scheduler's rx list
+ * with an extra reference, and the scheduler is woken.
+ */
+void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                /* must not call ourselves again */
+                LASSERT (sk->data_ready != &ksocknal_data_ready);
+                sk->data_ready (sk, n);
+        } else if (!conn->ksnc_rx_ready) {        /* new news */
+                /* Set ASAP in case of concurrent calls to me */
+                conn->ksnc_rx_ready = 1;
+
+                sched = conn->ksnc_scheduler;
+
+                spin_lock_irqsave (&sched->kss_lock, flags);
+
+                /* Set again (process_receive may have cleared while I blocked for the lock) */
+                conn->ksnc_rx_ready = 1;
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&sched->kss_waitq))
+                                wake_up (&sched->kss_waitq);
+                }
+
+                spin_unlock_irqrestore (&sched->kss_lock, flags);
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        EXIT;
+}
+
+/* Socket write_space callback.
+ *
+ * If the socket no longer has a connection attached (sk->user_data is
+ * NULL) the call is passed on to whatever write_space handler the socket
+ * has now.  Otherwise, once the socket's write space has reached
+ * SOCKNAL_TX_LOW_WATER, SOCK_NOSPACE is cleared and the connection is
+ * flagged as ready to transmit; if it has queued packets and is not
+ * already scheduled it is queued on its scheduler's tx list with an extra
+ * reference, and the scheduler is woken.
+ */
+void
+ksocknal_write_space (struct sock *sk)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+
+        /* NOTE(review): ksnc_tx_ready is read with test_bit() here but as a
+         * plain value below - confirm the field's type makes both valid */
+        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
+               (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ?
+                                      " ready" : " blocked"),
+               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                      " scheduled" : " idle"),
+               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                      " empty" : " queued"));
+
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                /* must not call ourselves again */
+                LASSERT (sk->write_space != &ksocknal_write_space);
+                sk->write_space (sk);
+        } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+                clear_bit (SOCK_NOSPACE, &sk->socket->flags);
+
+                if (!conn->ksnc_tx_ready) {      /* new news */
+                        /* Set ASAP in case of concurrent calls to me */
+                        conn->ksnc_tx_ready = 1;
+
+                        sched = conn->ksnc_scheduler;
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        /* Set again (process_transmit may have
+                           cleared while I blocked for the lock) */
+                        conn->ksnc_tx_ready = 1;
+
+                        if (!conn->ksnc_tx_scheduled && // not being progressed
+                            !list_empty(&conn->ksnc_tx_queue)){//packets to send
+                                list_add_tail (&conn->ksnc_tx_list,
+                                               &sched->kss_tx_conns);
+                                conn->ksnc_tx_scheduled = 1;
+                                /* extra ref for scheduler */
+                                atomic_inc (&conn->ksnc_refcount);
+
+                                if (waitqueue_active (&sched->kss_waitq))
+                                        wake_up (&sched->kss_waitq);
+                        }
+
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+}
+
+/* Reaper thread body ('arg' is unused).
+ *
+ * Until shutdown is flagged, repeatedly unlinks the first connection from
+ * ksocknal_data.ksnd_reaper_list and closes it; when the list is empty it
+ * sleeps on ksnd_reaper_waitq.  Always returns 0.
+ */
+int
+ksocknal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *victim;
+        int                rc;
+        
+        kportal_daemonize ("ksocknal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                victim = NULL;
+
+                /* pull one connection off the list under the reaper lock */
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_reaper_list)) {
+                        victim = list_entry (ksocknal_data.ksnd_reaper_list.next,
+                                             ksock_conn_t, ksnc_list);
+                        list_del (&victim->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (victim != NULL) {
+                        /* close it with the lock dropped */
+                        ksocknal_close_conn (victim);
+                        continue;
+                }
+
+                /* nothing queued: sleep until there is, or until shutdown */
+                rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq,
+                                               ksocknal_data.ksnd_shuttingdown ||
+                                               !list_empty(&ksocknal_data.ksnd_reaper_list));
+                LASSERT (rc == 0);
+        }
+
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+/* Callback table for this NAL: each cb_* slot is bound to the matching
+ * ksocknal_* function. */
+nal_cb_t ksocknal_lib = {
+        .nal_data      = &ksocknal_data,               /* NAL private data */
+        .cb_send       = ksocknal_send,
+        .cb_send_pages = ksocknal_send_pages,
+        .cb_recv       = ksocknal_recv,
+        .cb_recv_pages = ksocknal_recv_pages,
+        .cb_read       = ksocknal_read,
+        .cb_write      = ksocknal_write,
+        .cb_callback   = ksocknal_callback,
+        .cb_malloc     = ksocknal_malloc,
+        .cb_free       = ksocknal_free,
+        .cb_printf     = ksocknal_printf,
+        .cb_cli        = ksocknal_cli,
+        .cb_sti        = ksocknal_sti,
+        .cb_dist       = ksocknal_dist
+};
diff --git a/lnet/klnds/toelnd/Makefile.am b/lnet/klnds/toelnd/Makefile.am
new file mode 100644 (file)
index 0000000..9bfff64
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Shared build rules from two directories up.
+include ../../Rules.linux
+
+# The ktoenal module: ktoenal.o is the installed object.
+MODULE = ktoenal
+modulenet_DATA = ktoenal.o
+EXTRA_PROGRAMS = ktoenal
+
+# DEFS is deliberately set to nothing here.
+DEFS =
+# Sources that make up ktoenal.
+ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h
diff --git a/lnet/klnds/toelnd/toenal.c b/lnet/klnds/toelnd/toenal.c
new file mode 100644 (file)
index 0000000..178ea41
--- /dev/null
@@ -0,0 +1,629 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <linux/poll.h>
+#include "toenal.h"
+
+ptl_handle_ni_t         ktoenal_ni;
+static nal_t            ktoenal_api;
+/* Must have external linkage: toenal.h declares this object 'extern', and
+ * a 'static' definition following that declaration gives the identifier
+ * conflicting linkage. */
+ksock_nal_data_t        ktoenal_data;
+
+/*
+ksocknal_interface_t ktoenal_interface = {
+        ksni_add_sock:         ktoenal_add_sock,
+        ksni_close_sock:       ktoenal_close_sock,
+        ksni_set_mynid:                ktoenal_set_mynid,
+};
+*/
+
+/* Description of this NAL that ktoenal_module_init() passes to
+ * kpr_register(). */
+kpr_nal_interface_t ktoenal_router_interface = {
+        .kprni_nalid = TOENAL,
+        .kprni_arg   = &ktoenal_data,
+        .kprni_fwd   = ktoenal_fwd_packet,
+};
+
+
+/*
+ * API 'forward' hook: pass request 'id' with its argument and return
+ * blocks to lib_dispatch().  'args_len' and 'ret_len' are not used here.
+ * Always returns PTL_OK.
+ */
+int
+ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        /* the NAL state doubles as the private argument: ktoenal_send
+         * needs it */
+        lib_dispatch(data->ksnd_nal_cb, data, id, args, ret);
+
+        return PTL_OK;
+}
+
+/*
+ * API 'shutdown' hook: close every connection.  Returns whatever
+ * ktoenal_close_sock() returns for the "all peers" request.
+ */
+int
+ktoenal_api_shutdown(nal_t *nal, int ni)
+{
+        int rc;
+
+        CDEBUG (D_NET, "closing all connections\n");
+
+        /* a zero nid asks ktoenal_close_sock() to close all sockets */
+        rc = ktoenal_close_sock(0);
+        return rc;
+}
+
+/* API 'yield' hook: give up the CPU via our_cond_resched(). */
+void
+ktoenal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+}
+
+/* API 'lock' hook: delegate to the lib callback table's cb_cli entry. */
+void
+ktoenal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *cb = data->ksnd_nal_cb;
+
+        cb->cb_cli(cb, flags);
+}
+
+/* API 'unlock' hook: delegate to the lib callback table's cb_sti entry. */
+void
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *cb = data->ksnd_nal_cb;
+
+        cb->cb_sti(cb, flags);
+}
+
+/*
+ * NAL initialisation hook passed to PtlNIInit(): initialise the portals
+ * library with the currently configured NID and return this NAL's API
+ * object.  'interface' and 'requested_pid' are not used here.
+ */
+nal_t *
+ktoenal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        ptl_nid_t mynid = ktoenal_data.ksnd_mynid;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", mynid);
+
+        lib_init(&ktoenal_lib, mynid, 0, 10, ptl_size, ac_size);
+
+        return (&ktoenal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+/* Map an inode to its struct socket via SOCKET_I(); performs no check
+ * that the inode actually belongs to a socket. */
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        struct socket *sock = SOCKET_I(inode);
+
+        return sock;
+}
+
+/*
+ * Record 'nid' as this node's NID, both in the NAL state and in the
+ * library's network interface.  Always returns 0.
+ */
+int
+ktoenal_set_mynid(ptl_nid_t nid)
+{
+        /* FIXME: lib_init() runs at module insertion time, before 'mynid'
+         * is known, and it is lib_init() that sets the NID the NAL uses to
+         * tell other nodes where packets come from.  Patching the NID in
+         * afterwards, as done here, is not a very graceful solution. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ktoenal_lib.ni.nid);
+
+        ktoenal_data.ksnd_mynid = nid;
+        ktoenal_lib.ni.nid = nid;
+
+        return (0);
+}
+
+/*
+ * Register the socket behind file descriptor 'fd' as the connection to
+ * peer 'nid'.
+ *
+ * Takes a reference on the file (dropped again on failure), allocates and
+ * initialises a connection descriptor, puts it on the global socket list
+ * and wakes the poll thread so that it polls the new socket.
+ *
+ * Returns 0 on success, -EINVAL if 'fd' is not an open socket, or -ENOMEM
+ * if the connection descriptor cannot be allocated.
+ */
+int
+ktoenal_add_sock (ptl_nid_t nid, int fd)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file;
+        struct inode      *inode;
+        struct socket     *sock;
+        int                ret;
+        ENTRY;
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        /* 'fd' is caller-supplied (see ktoenal_cmd) and SOCKET_I() blindly
+         * reinterprets the inode, so refuse anything that isn't a socket */
+        ret = -EINVAL;
+        inode = file->f_dentry->d_inode;
+        if (inode == NULL || !S_ISSOCK(inode->i_mode))
+                GOTO(error, ret);
+
+        sock = socki_lookup(inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (conn == NULL)
+                GOTO(error, ret);
+
+        /* zero the whole descriptor (sizeof (conn) would only cover a
+         * pointer's worth of it) */
+        memset (conn, 0, sizeof (*conn));
+        file->f_flags |= O_NONBLOCK;  /*  Does this have any conflicts */
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ktoenal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist);
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        ktoenal_data_ready(conn);
+        ktoenal_write_space(conn);
+
+        /* Schedule pollthread so that it will poll
+         * for newly created socket
+         */
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);             /* drop the reference fget() took */
+        RETURN(ret);
+}
+
+/*
+ * Close the connection to peer 'nid'; passing in a zero nid will close
+ * all connections.
+ *
+ * Matching connections are moved from the global socket list onto a local
+ * list under the socklist lock; the socket list's reference on each is
+ * then dropped outside the lock and the poll thread is woken.  For a
+ * non-zero nid only the first matching connection is closed.
+ *
+ * Returns 0 if at least one connection was closed, -ENOENT otherwise.
+ */
+int
+ktoenal_close_sock(ptl_nid_t nid)
+{
+        unsigned long      flags;       /* irqsave state is unsigned long */
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ktoenal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ktoenal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else {
+                list_for_each (tmp, &ktoenal_data.ksnd_socklist) {
+                        conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn->ksnc_peernid == nid) {
+                                list_del (&conn->ksnc_list);
+                                list_add (&conn->ksnc_list, &death_row);
+                                break;
+                        }
+                }
+        }
+
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (list_empty (&death_row))
+                return (-ENOENT);
+
+        do {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+                ktoenal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        } while (!list_empty (&death_row));
+
+        /* tell the poll thread that the socket list has changed */
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        return (0);
+}
+
+
+/*
+ * Find the connection to peer 'nid' on the global socket list.
+ *
+ * On success the connection's refcount has been incremented on behalf of
+ * the caller.  Returns NULL if no connection to 'nid' is listed.
+ */
+ksock_conn_t *
+ktoenal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *pos;
+        ksock_conn_t     *found = NULL;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ktoenal_data.ksnd_socklist_lock);
+
+        list_for_each(pos, &ktoenal_data.ksnd_socklist) {
+                ksock_conn_t *candidate = list_entry(pos, ksock_conn_t, ksnc_list);
+
+                if (candidate->ksnc_peernid != nid)
+                        continue;
+
+                /* caller is referencing */
+                atomic_inc (&candidate->ksnc_refcount);
+                found = candidate;
+                break;
+        }
+
+        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+        if (found != NULL)
+                CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                       found, nid, atomic_read (&found->ksnc_refcount));
+        else
+                CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid);
+
+        PROF_FINISH(conn_list_walk);
+        return (found);
+}
+
+/*
+ * Final teardown of a connection: release the file reference held in
+ * ksnc_file, free the descriptor and drop the module use count.
+ * 'conn' must not be touched after this returns.
+ */
+void
+ktoenal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+/*
+ * Dispose of a connection whose refcount has reached zero (called from
+ * ktoenal_put_conn()).  Outside interrupt context the connection is closed
+ * immediately; in interrupt context it is queued on the reaper list and
+ * the reaper wait queue is woken.
+ */
+void
+_ktoenal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (in_interrupt()) {
+                /* defer the close: queue on the reaper list */
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list);
+                wake_up (&ktoenal_data.ksnd_reaper_waitq);
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+                return;
+        }
+
+        ktoenal_close_conn (conn);
+}
+
+/*
+ * Release the pre-allocated forwarding message buffers (and every page
+ * attached to them) and the pre-allocated LTX array, if they exist.
+ */
+void
+ktoenal_free_buffers (void)
+{
+        const int    nfmbs = SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS;
+        const int    nltxs = SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS;
+        ksock_fmb_t *fmbs = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs;
+        int          i;
+        int          j;
+
+        if (fmbs != NULL) {
+                for (i = 0; i < nfmbs; i++) {
+                        ksock_fmb_t *fmb = &fmbs[i];
+
+                        for (j = 0; j < fmb->fmb_npages; j++) {
+                                /* slots may be unpopulated */
+                                if (fmb->fmb_pages[j] == NULL)
+                                        continue;
+
+                                __free_page (fmb->fmb_pages[j]);
+                        }
+                }
+
+                PORTAL_FREE (ktoenal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * nfmbs);
+        }
+
+        if (ktoenal_data.ksnd_ltxs != NULL)
+                PORTAL_FREE (ktoenal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * nltxs);
+}
+
+/*
+ * NAL command handler (registered with kportal_nal_register()): dispatch
+ * on data->ioc_nal_cmd to register a peer socket, close a connection or
+ * set the local NID.  Returns the handler's result, or -EINVAL for a
+ * command that is not recognised.  'private' is not used here.
+ */
+int
+ktoenal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD:
+                return ktoenal_add_sock(data->ioc_nid, data->ioc_fd);
+
+        case NAL_CMD_CLOSE_CONNECTION:
+                return ktoenal_close_sock(data->ioc_nid);
+
+        case NAL_CMD_REGISTER_MYNID:
+                return ktoenal_set_mynid (data->ioc_nid);
+
+        default:
+                break;
+        }
+
+        return -EINVAL;
+}
+
+
+/*
+ * Module teardown.  Also called by ktoenal_module_init() to unwind a
+ * partially completed initialisation.
+ *
+ * ktoenal_data.ksnd_init records how far initialisation got; the switch
+ * enters at that stage and falls through every earlier one, so each case
+ * undoes exactly the work done by its stage.
+ */
+void __exit
+ktoenal_module_fini (void)
+{
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ktoenal_data.ksnd_init)
+        {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(TOENAL);
+                PORTAL_SYMBOL_UNREGISTER (ktoenal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ktoenal_ni);
+                lib_fini(&ktoenal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ktoenal_data.ksnd_socklist));
+                LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ktoenal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ktoenal_data.ksnd_reaper_waitq);
+                wake_up_all (&ktoenal_data.ksnd_sched_waitq);
+                /* When unwinding a failed init the poll thread may never
+                 * have been started, in which case the task pointer is
+                 * still the NULL left by the memset of ktoenal_data */
+                if (ktoenal_data.ksnd_pollthread_tsk != NULL)
+                        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+                while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0)
+                {
+                        CDEBUG (D_NET, "waitinf for %d threads to terminate\n",
+                                atomic_read (&ktoenal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ktoenal_data.ksnd_router);
+
+                ktoenal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+/*
+ * Module initialisation.
+ *
+ * Sets up the API vector and the global NAL state, pre-allocates the
+ * forwarding buffers and LTX descriptors, initialises the portals network
+ * interface, starts the scheduler, reaper and poll threads, and finally
+ * registers with the router and the NAL command interface.
+ *
+ * ktoenal_data.ksnd_init is advanced as stages complete so that
+ * ktoenal_module_fini(), which is called on most failure paths below,
+ * knows how much to undo.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * reported by PtlNIInit() / ktoenal_thread_start().
+ */
+int __init
+ktoenal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ktoenal_api.forward  = ktoenal_api_forward;
+        ktoenal_api.shutdown = ktoenal_api_shutdown;
+        ktoenal_api.yield    = ktoenal_api_yield;
+        ktoenal_api.validate = NULL;           /* our api validate is a NOOP */
+        ktoenal_api.lock     = ktoenal_api_lock;
+        ktoenal_api.unlock   = ktoenal_api_unlock;
+        ktoenal_api.nal_data = &ktoenal_data;
+
+        ktoenal_lib.nal_data = &ktoenal_data;
+
+        memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist);
+        rwlock_init(&ktoenal_data.ksnd_socklist_lock);
+
+        ktoenal_data.ksnd_nal_cb = &ktoenal_lib;
+        spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init (&ktoenal_data.ksnd_sched_lock);
+
+        init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns);
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq);
+        spin_lock_init (&ktoenal_data.ksnd_reaper_lock);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */
+
+        /* one array holds the small forwarding buffers followed by the
+         * large ones */
+        PORTAL_ALLOC(ktoenal_data.ksnd_fmbs,
+                     sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        /* NB this is the one failure path that returns without calling
+         * ktoenal_module_fini() */
+        if (ktoenal_data.ksnd_fmbs == NULL)
+                RETURN(-ENOMEM);
+
+        /* NULL out buffer pointers etc */
+        memset(ktoenal_data.ksnd_fmbs, 0,
+               sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+
+        for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++)
+        {
+                ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i];
+
+                if (i < SOCKNAL_SMALL_FWD_NMSGS)
+                {
+                        fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp;
+                }
+                else
+                {
+                        fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp;
+                }
+
+                LASSERT (fmb->fmb_npages > 0);
+                for (j = 0; j < fmb->fmb_npages; j++)
+                {
+                        fmb->fmb_pages[j] = alloc_page (GFP_KERNEL);
+
+                        /* pages allocated so far are released by
+                         * ktoenal_free_buffers() via ktoenal_module_fini() */
+                        if (fmb->fmb_pages[j] == NULL)
+                        {
+                                ktoenal_module_fini ();
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT (page_address (fmb->fmb_pages[j]) != NULL);
+                }
+
+                list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+        }
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_ltxs,
+                     sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+        if (ktoenal_data.ksnd_ltxs == NULL)
+        {
+                ktoenal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ktoenal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        /* the first SOCKNAL_NLTXS descriptors go on the normal idle list,
+         * the remainder on the list reserved for callers that can't block */
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++)
+        {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ktoenal_data.ksnd_idle_ltx_list :
+                                &ktoenal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni);
+        if (rc != 0)
+        {
+                CERROR("ktoenal: PtlNIInit failed: error %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ktoenal_ni, ~0);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */
+
+        ktoenal_data.ksnd_slistchange = 1;
+        for (i = 0; i < TOENAL_N_SCHED; i++)
+        {
+                rc = ktoenal_thread_start (ktoenal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc);
+                        ktoenal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ktoenal_thread_start (ktoenal_reaper, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = ktoenal_thread_start (ktoenal_pollthread, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal pollthread: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        /* failure to register with the router or the command interface is
+         * only logged; initialisation carries on regardless */
+        rc = kpr_register(&ktoenal_data.ksnd_router,
+                  &ktoenal_router_interface);
+        if (rc != 0)
+                CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+        rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL);
+        if (rc != 0)
+                CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n",
+                       rc);
+
+        PORTAL_SYMBOL_REGISTER(ktoenal_ni);
+
+        /* flag everything initialised */
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+       printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+              kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ktoenal_module_init);
+module_exit(ktoenal_module_fini);
+
+EXPORT_SYMBOL (ktoenal_ni);
diff --git a/lnet/klnds/toelnd/toenal.h b/lnet/klnds/toelnd/toenal.h
new file mode 100644 (file)
index 0000000..f793d3b
--- /dev/null
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/sched.h> 
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)        /* biggest payload I can forward */
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 32              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
+
+#define TOENAL_N_SCHED 1
+
+/* One pool exists per buffer size class: ksock_nal_data_t holds a small
+ * and a large instance. */
+typedef struct                                  /* pool of forwarding buffers */
+{
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+/* Global state of the NAL; the single instance is ktoenal_data. */
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+
+        ptl_nid_t         ksnd_mynid;           /* my NID (set by ktoenal_set_mynid) */
+        nal_cb_t         *ksnd_nal_cb;          /* lib callback table (&ktoenal_lib) */
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        spinlock_t        ksnd_sched_lock;      /* serialise packet scheduling */
+        wait_queue_head_t ksnd_sched_waitq;     /* where scheduler(s) wait */
+
+        struct list_head  ksnd_rx_conns;        /* conn waiting to be read */
+        struct list_head  ksnd_tx_conns;        /* conn waiting to be written */
+        
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        
+        struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */
+        poll_table          ksnd_pwait;         /* poll wait table for the socket */
+        int                 ksnd_slistchange;   /* informs the pollthread that
+                                                 * the socklist has changed */  
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+/* Common transmit descriptor.  ktoenal_module_init() asserts that it fits
+ * in kprfd_scratch_t, the router descriptor's scratchpad. */
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;       /* queue on conn for transmission etc */
+        char                    tx_isfwd;      /* forwarding / sourced here */
+        int                     tx_nob;        /* # packet bytes */
+        int                     tx_niov;       /* # packet frags */
+        struct iovec           *tx_iov;        /* packet frags */
+} ksock_tx_t;
+
+/* Descriptors of this type are pre-allocated in one array at module init
+ * and parked on the idle list that ltx_idle points at. */
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        struct iovec            ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as a convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
+
+/* The page and iovec arrays are sized for the large class; fmb_npages says
+ * how many entries a given buffer actually uses. */
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+/* Per-connection state: one descriptor per registered peer socket.
+ * Reference counted through ksnc_refcount; the global socket list holds
+ * one reference. */
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* socket */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        unsigned long       ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # frags */
+        struct iovec        ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */
+
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        unsigned long       ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+        
+} ksock_conn_t;
+
+extern int ktoenal_add_sock (ptl_nid_t nid, int fd);
+extern int ktoenal_close_sock(ptl_nid_t nid);
+extern int ktoenal_set_mynid(ptl_nid_t nid);
+extern int ktoenal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid);
+extern void _ktoenal_put_conn (ksock_conn_t *conn);
+extern void ktoenal_close_conn (ksock_conn_t *conn);
+
+/* Drop one reference on 'conn'; whoever drops the last one hands the
+ * connection to _ktoenal_put_conn() for disposal. */
+static inline void
+ktoenal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", 
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        if (!atomic_dec_and_test (&conn->ksnc_refcount))
+                return;                         /* still referenced */
+
+        _ktoenal_put_conn (conn);
+}
+
+extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ktoenal_new_packet (ksock_conn_t *conn, int skip);
+extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ktoenal_scheduler (void *arg);
+extern int ktoenal_reaper (void *arg);
+extern int ktoenal_pollthread (void *arg);
+extern void ktoenal_data_ready(ksock_conn_t *conn);
+extern void ktoenal_write_space(ksock_conn_t *conn);
+
+
+extern nal_cb_t         ktoenal_lib;
+extern ksock_nal_data_t ktoenal_data;
diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c
new file mode 100644 (file)
index 0000000..8270196
--- /dev/null
@@ -0,0 +1,1220 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *   
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/poll.h>
+#include "toenal.h"
+/* Packet counters; all three are updated further down in this file. */
+atomic_t   ktoenal_packets_received;    /* atomic_inc'd when an inbound packet has been read completely */
+long       ktoenal_packets_launched;    /* bumped in ktoenal_launch_packet() with ksnd_sched_lock held */
+long       ktoenal_packets_transmitted; /* bumped in ktoenal_process_transmit() after it re-takes ksnd_sched_lock */
+
+/*
+ *  LIB functions follow
+ *
+ */
+/*
+ * Copy 'len' bytes from 'src_addr' to 'dst_addr' with a straight memcpy();
+ * 'private' is not looked at.  Always returns 0.
+ */
+int
+ktoenal_read(nal_cb_t *nal, void *private,
+             void *dst_addr, user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET,
+               LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy (dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/*
+ * Mirror image of ktoenal_read(): memcpy() 'len' bytes from 'src_addr'
+ * into 'dst_addr'.  'private' is not looked at.  Always returns 0.
+ */
+int
+ktoenal_write(nal_cb_t *nal, void *private,
+              user_ptr dst_addr, void *src_addr, size_t len)
+{
+        CDEBUG(D_NET,
+               LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy (dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/*
+ * Hand event 'ev' to the handler registered on event queue 'eq', if one
+ * is registered.  Always returns 0.
+ */
+int
+ktoenal_callback (nal_cb_t *nal, void *private,
+                  lib_eq_t *eq, ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback == NULL)         /* nobody to notify */
+                return (0);
+
+        eq->event_callback(ev);
+        return (0);
+}
+
+/*
+ * Allocate 'len' bytes through PORTAL_ALLOC and return them zero-filled.
+ * Returns NULL when the allocation failed.
+ */
+void *
+ktoenal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        if (buf == NULL)
+                return (buf);
+
+        memset(buf, 0, len);
+        return (buf);
+}
+/* Release a 'len'-byte buffer through PORTAL_FREE; 'nal' is not used. */
+void
+ktoenal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/*
+ * Format a printf-style message into a 256-byte stack buffer (anything
+ * longer is cut off) and emit it at the D_NET debug level.
+ */
+void
+ktoenal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    msg[256];
+        va_list ap;
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap);
+        va_end (ap);
+
+        /* force termination regardless of what vsnprintf left behind */
+        msg[sizeof (msg) - 1] = 0;
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+/*
+ * Take the NAL callback lock.  'flags' is accepted but never touched:
+ * this is a plain spin_lock(), not an irqsave variant.
+ */
+void
+ktoenal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+
+        data = nal->nal_data;
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+/*
+ * Release the NAL callback lock taken by ktoenal_cli().  'flags' is
+ * accepted but never read: this is a plain spin_unlock().
+ */
+void
+ktoenal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+/*
+ * Report the "distance" to 'nid' through *dist: 0 when 'nid' is this
+ * NAL's own nid, 1 for anything else.  Always returns 0.
+ */
+int
+ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ktoenal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return (0);
+}
+
+/*
+ * Grab an idle local transmit descriptor.
+ *
+ * The normal idle list (ksnd_idle_ltx_list) is always tried first.  If it
+ * is empty:
+ *   - may_block == 0: fall back to the reserved non-blocking list
+ *     (ksnd_idle_nblk_ltx_list); returns NULL if that is empty as well.
+ *   - may_block != 0: sleep on ksnd_idle_ltx_waitq until the normal idle
+ *     list is non-empty, then retry under the lock.
+ *
+ * Returns the descriptor, already unlinked from its idle list, or NULL.
+ */
+ksock_ltx_t *
+ktoenal_get_ltx (int may_block)
+{
+        unsigned long  flags;   /* spin_lock_irqsave() expects unsigned long */
+        ksock_ltx_t   *ltx = NULL;
+        
+        for (;;)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+                if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list))
+                {
+                        ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block)
+                {
+                        if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list))
+                        {
+                                ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, 
+                                                  ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+                
+                /* must not sleep holding the spinlock */
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+                
+                wait_event (ktoenal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ktoenal_data.ksnd_idle_ltx_list));
+        }
+
+        /* every 'break' above leaves the loop with the lock still held */
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+        return (ltx);
+}
+/*
+ * Write the 'niov' fragments of 'iov' to 'sock' through its f_op->writev
+ * method, with the address limit switched to KERNEL_DS around the call.
+ *
+ * Returns whatever writev returned.  When that is > 0, 'iov' is advanced
+ * in place past the bytes written (iov_base bumped, iov_len reduced;
+ * fully written fragments are left with iov_len == 0).
+ *
+ * NOTE(review): 'flags' is never referenced in the body, so the MSG_*
+ * values callers pass have no effect here - confirm that is intended.
+ * 'nob' is only cross-checked against the iov total under PORTAL_DEBUG.
+ */
+int
+ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags)
+{
+        /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't)
+         */
+        mm_segment_t oldmm;
+        int           rc;
+
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        oldmm = get_fs();                       /* remember the current address limit */
+        set_fs (KERNEL_DS);
+
+#ifdef PORTAL_DEBUG
+        {
+                int total_nob;
+                int i;
+                
+                for (i = total_nob = 0; i < niov; i++)
+                        total_nob += iov[i].iov_len;
+                
+                LASSERT (nob == total_nob);
+        }
+#endif        
+        LASSERT (!in_interrupt());
+       
+        rc = sock->f_op->writev(sock, iov, niov, NULL);
+
+        set_fs (oldmm);                         /* restore the saved address limit */
+
+        if (rc > 0)                             /* sent something? */
+        {
+                nob = rc;                       /* consume iov */
+                for (;;)
+                {
+                        LASSERT (niov > 0);
+                        
+                        if (iov->iov_len >= nob) /* remainder ends inside this fragment */
+                        {
+                                iov->iov_len -= nob;
+                                iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob);
+                                break;
+                        }
+                        nob -= iov->iov_len;    /* fragment fully sent; move to the next */
+                        iov->iov_len = 0;
+                        iov++;
+                        niov--;
+                }
+        }
+
+        return (rc);
+}
+/*
+ * Read at most 'toread' bytes from 'sock' into 'iov' through f_op->readv,
+ * with the address limit switched to KERNEL_DS around the call.
+ *
+ * The fragment in which the running length first reaches 'toread' is
+ * shortened for the duration of the call so that the fragments handed to
+ * readv total exactly 'toread'; afterwards its iov_len is set back to the
+ * saved value (when that value is non-zero).  If all 'niov' fragments
+ * together are shorter than 'toread' they are passed as they are.
+ * Returns readv's result.
+ *
+ * NOTE(review): niov == 0 would leave i == -1 and call readv with a count
+ * of 0; the caller visible in this file asserts ksnc_rx_niov > 0 first.
+ */
+int
+ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread)
+{
+        /* NB This procedure "consumes" iov (actually tcp_recvmsg does)
+         */
+        mm_segment_t oldmm;
+        int ret, i, len = 0, origlen = 0;
+        
+        PROF_START(our_recvmsg);
+        for(i = 0; i < niov; i++) {             /* find the fragment where 'toread' is reached */
+                len += iov[i].iov_len;
+                if(len >= toread)
+                        break;
+        }
+
+        if(len >= toread) {
+                origlen = iov[i].iov_len;       /* saved so it can be put back below */
+                iov[i].iov_len -= (len - toread);
+        }
+        else {  /* i == niov */
+                i = niov - 1;
+        }
+
+        oldmm = get_fs();
+        set_fs(KERNEL_DS);
+
+        ret = sock->f_op->readv(sock, iov, i + 1, NULL);
+        
+        set_fs(oldmm);
+
+        if(origlen)
+                iov[i].iov_len = origlen;
+
+        PROF_FINISH(our_recvmsg);
+        return ret;
+}
+/*
+ * Try to send the packet at the head of conn's tx queue.
+ *
+ * Entered with ksnd_sched_lock held (flags saved in *irq_flags).  The lock
+ * is dropped around the socket write and the completion callbacks and is
+ * held again on return.
+ *
+ *  - Whole packet sent: the packet's conn ref is dropped, then either
+ *    kpr_fwd_done() (forwarded packet) or lib_finalize() plus recycling
+ *    of the ltx descriptor onto its idle list (local send) is run.
+ *  - Partial send or -EAGAIN: tx_nob is reduced by the bytes sent and the
+ *    packet goes back on the HEAD of the queue.
+ *  - Any other error is logged and then treated as a complete send.
+ *
+ * Finally the conn is either descheduled (dropping the scheduler's ref)
+ * or re-queued on ksnd_tx_conns for another pass.
+ */
+void
+ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        int         rc;
+        
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+
+        /* assume transmit will complete now, so dequeue while I've got the lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;                /* write_space may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to write */
+
+        rc = ktoenal_sendmsg (conn->ksnc_file,
+                               tx->tx_iov, tx->tx_niov, tx->tx_nob,
+                               list_empty (&conn->ksnc_tx_queue) ? 
+                               MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE));
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc < 0)                             /* error */
+        {
+                if (rc == -EAGAIN)              /* socket full => */
+                        rc = 0;                 /* nothing sent */
+                else
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                        rc = tx->tx_nob;        /* kid on for now whole packet went */
+                }
+        }
+
+        if (rc == tx->tx_nob)                   /* everything went */
+        {
+                conn->ksnc_tx_ready = 1;        /* assume more can go (ASAP) */
+                ktoenal_put_conn (conn);       /* release packet's ref */
+
+                if (tx->tx_isfwd)               /* was a forwarded packet? */
+                {
+                        kpr_fwd_done (&ktoenal_data.ksnd_router,
+                                      KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                }
+                else                            /* local send */
+                {
+                        ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+                        lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                        
+                        list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle); /* back on the idle list it is tagged with */
+
+                        /* normal tx desc => wakeup anyone blocking for one */
+                        if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list &&
+                            waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq))
+                                wake_up (&ktoenal_data.ksnd_idle_ltx_waitq);
+                }
+                ktoenal_packets_transmitted++;  /* sched lock is held again on both paths */
+        }
+        else
+        {
+                tx->tx_nob -= rc;               /* rc bytes went; ktoenal_sendmsg already advanced the iov */
+
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue))  /* nothing to write */
+        {
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+}
+
+/*
+ * Queue 'tx' for transmission on 'conn'.
+ *
+ * tx->tx_iov is first trimmed in place so that its fragments total
+ * exactly tx->tx_nob, and tx->tx_niov is reduced to the number of
+ * fragments kept.  The packet is then appended to the conn's tx queue
+ * under ksnd_sched_lock.  If the conn is writable and not already
+ * scheduled, it is put on ksnd_tx_conns with an extra reference for the
+ * scheduler, and the scheduler is woken.
+ */
+void
+ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long flags;            /* spin_lock_irqsave() expects unsigned long */
+        int           nob = tx->tx_nob;
+        struct iovec *iov = tx->tx_iov;
+        int           niov = 1;
+        
+        LASSERT (nob >= sizeof (ptl_hdr_t));
+
+        /* Truncate iov to exactly match total packet length
+         * since socket sendmsg pays no attention to requested length.
+         */
+        for (;;)
+        {
+                LASSERT (niov <= tx->tx_niov);
+                LASSERT (iov->iov_len >= 0);
+                
+                if (iov->iov_len >= nob)        /* packet ends inside this fragment */
+                {
+                        iov->iov_len = nob;
+                        break;
+                }
+                nob -= iov->iov_len;
+                iov++;
+                niov++;
+        }
+        tx->tx_niov = niov;
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled)           /* not scheduled to send */
+        {
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        ktoenal_packets_launched++;
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
+/*
+ * Send a Portals message: header 'hdr' followed by the 'payload_niov'
+ * fragments of 'payload_iov' ('payload_len' bytes in total) to 'nid'.
+ *
+ * The connection used is the peer connection to 'nid' if there is one,
+ * otherwise the connection to the gateway the router returns for 'nid'.
+ * The header is copied into the transmit descriptor; the payload
+ * fragments are referenced, not copied.  'private' and 'cookie' are
+ * stashed in the descriptor and passed to lib_finalize() when the
+ * transmit completes.
+ *
+ * Returns 0 once the packet has been queued, or -1 if there is no route,
+ * the gateway is not a peer, or no transmit descriptor is available.
+ */
+int
+ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie,
+              ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+              unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        int           rc;
+        int           i;
+
+        /* By this point, as it happens, we have absolutely no idea what
+         * 'private' is.  It might be ksock_nal_data or it might be ksock_conn.
+         * Ha ha, isn't that a funny joke?
+         *
+         * FIXME: this is not the right way to fix this; the right way is to
+         * always pass in the same kind of structure.  This is hard right now.
+         * To revisit this issue, set a breakpoint in here and watch for when
+         * it's called from lib_finalize.  I think this occurs when we send a
+         * packet as a side-effect of another packet, such as when an ACK has
+         * been requested. -phil */
+
+        /* iov_len is cast to int so the argument matches its %d conversion */
+        CDEBUG(D_NET, "sending "LPSZ" bytes from [%d](%p,%d)... to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov,
+               payload_niov > 0 ? payload_iov[0].iov_base : NULL,
+               payload_niov > 0 ? (int)payload_iov[0].iov_len : 0,
+               nid, pid);
+
+        if ((conn = ktoenal_get_conn (nid)) == NULL)
+        {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0)
+                {
+                        CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
+                        return (-1);
+                }
+
+                if ((conn = ktoenal_get_conn (gatewaynid)) == NULL)
+                {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", 
+                                nid, gatewaynid);
+                        return (-1);
+                }
+        }
+
+        /* This transmit has now got a ref on conn */
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK ||
+                                 type == PTL_MSG_REPLY ||
+                                 in_interrupt ()));
+        if (ltx == NULL)
+        {
+                CERROR ("Can't allocate tx desc\n");
+                ktoenal_put_conn (conn);        /* give back the ref taken above */
+                return (-1);
+        }
+        
+        /* Init common (to sends and forwards) packet part */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_iov = ltx->ltx_iov;
+
+        /* Init local send packet (storage for hdr, finalize() args, iov) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        /* fragment 0 is always the private copy of the header */
+        ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        for (i = 0; i < payload_niov; i++)
+        {
+                ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base;
+                ltx->ltx_iov[1 + i].iov_len  = payload_iov[i].iov_len;
+        }
+
+        ktoenal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/*
+ * Forward the packet described by 'fwd'.  The next hop is the gateway
+ * nid recorded in 'fwd', unless that gateway is this node, in which case
+ * the packet goes straight to the target nid.  If there is no connection
+ * to the next hop the forward is completed with -EHOSTUNREACH; otherwise
+ * a tx descriptor is built in fwd's scratch area and queued on the conn.
+ */
+void
+ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_tx_t   *tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+        ksock_conn_t *conn;
+        ptl_nid_t     nid;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, 
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (fwd->kprfd_gateway_nid == ktoenal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+        else
+                nid = fwd->kprfd_gateway_nid;
+
+        conn = ktoenal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                       /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+
+        ktoenal_launch_packet (conn, tx);
+}
+
+/*
+ * Start a kernel thread running fn(arg).  On success the module's thread
+ * count (ksnd_nthreads) is bumped and 0 is returned; on failure the
+ * negative value from kernel_thread() is returned.
+ */
+int
+ktoenal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid;
+
+        pid = kernel_thread (fn, arg, 0);
+        if (pid >= 0) {
+                atomic_inc (&ktoenal_data.ksnd_nthreads);
+                return (0);
+        }
+
+        return ((int)pid);
+}
+/* Undo the ksnd_nthreads increment made by ktoenal_thread_start(). */
+void
+ktoenal_thread_fini (void)
+{
+        atomic_dec (&ktoenal_data.ksnd_nthreads);
+}
+
+/*
+ * Completion callback for a forwarded message buffer ('arg' is the
+ * ksock_fmb_t; 'error' is the forwarding result, logged if non-zero).
+ *
+ * The buffer is returned to its pool's idle list.  If a connection is
+ * parked on that pool waiting for a buffer, the first such connection is
+ * moved back to SOCKNAL_RX_GET_FMB, queued on ksnd_rx_conns and the
+ * scheduler is woken.
+ */
+void
+ktoenal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn;
+        unsigned long      flags;       /* spin_lock_irqsave() expects unsigned long */
+
+        CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", 
+                hdr->src_nid, hdr->dest_nid, error);
+
+        if (error != 0)
+                CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", 
+                        hdr->src_nid, hdr->dest_nid, error);
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+        /* buffer is free again */
+        list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+
+        if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns))
+        {
+                /* hand the scheduler the first conn that was waiting on this pool */
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+
+                CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+                LASSERT (conn->ksnc_rx_scheduled);
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+                conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
+/*
+ * Pick a forwarding buffer for the packet 'conn' is receiving: the small
+ * pool if header plus payload fits in SOCKNAL_SMALL_FWD_PAGES pages, the
+ * large pool otherwise.  Returns an idle buffer unlinked from its pool,
+ * or NULL after parking the conn (state SOCKNAL_RX_FMB_SLEEP) on the
+ * pool's blocked list.
+ */
+ksock_fmb_t *
+ktoenal_get_idle_fmb (ksock_conn_t *conn)
+{
+        /* NB called with sched lock held */
+        int               packet_nob = sizeof (ptl_hdr_t) + conn->ksnc_rx_nob_left;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+
+        pool = (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) ?
+               &ktoenal_data.ksnd_small_fmp :
+               &ktoenal_data.ksnd_large_fmp;
+
+        if (list_empty (&pool->fmp_idle_fmbs)) {
+                /* deschedule until fmb free */
+                conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+                list_add_tail (&conn->ksnc_rx_list, &pool->fmp_blocked_conns);
+                return (NULL);
+        }
+
+        fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list);
+        list_del (&fmb->fmb_list);
+        return (fmb);
+}
+
+/*
+ * Prepare forwarding buffer 'fmb' for the packet whose header has just
+ * been read into conn->ksnc_hdr.
+ *
+ * The header is copied to the start of the buffer's first page.  With no
+ * payload the forward is started at once, the conn is reset for the next
+ * packet and 1 is returned.  Otherwise fmb->fmb_iov is laid out over the
+ * buffer's pages to span the whole packet, conn->ksnc_rx_iov is pointed
+ * at the payload part of it, the conn moves to SOCKNAL_RX_BODY_FWD and 0
+ * is returned.
+ */
+int
+ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        int niov;                               /* at least the header */
+        int nob;
+        
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+        
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+                
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */
+
+        if (payload_nob == 0)                   /* got complete packet already */
+        {
+                atomic_inc (&ktoenal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                              packet_nob, 1, fmb->fmb_iov, 
+                              ktoenal_fmb_callback, fmb);
+
+                kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE)            /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        else
+        {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;   /* bytes still to be mapped onto further pages */
+                
+                do
+                {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                      packet_nob, niov, fmb->fmb_iov, 
+                      ktoenal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */        
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        if (niov > 1)                           /* remaining fragments are shared unchanged */
+                memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob);
+        return (0);
+}
+/*
+ * Examine the header in conn->ksnc_hdr of a packet to be forwarded and
+ * decide what to read next.
+ *
+ * The packet is dropped (via ktoenal_new_packet) if its type is unknown,
+ * its body length is negative or exceeds SOCKNAL_MAX_FWD_PAYLOAD, or its
+ * destination is itself a direct peer.  In the last two cases the body
+ * length is passed on so the body gets skipped.  Otherwise the conn moves
+ * to SOCKNAL_RX_GET_FMB with rx_nob_left/rx_nob_wanted set to the body
+ * length.
+ */
+void
+ktoenal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        int           body_len;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        switch (conn->ksnc_hdr.type)
+        {
+        case PTL_MSG_GET:
+        case PTL_MSG_ACK:
+                body_len = 0;                   /* these types are treated as header-only */
+                break;
+        case PTL_MSG_PUT:
+                body_len = conn->ksnc_hdr.msg.put.length;
+                break;
+        case PTL_MSG_REPLY:
+                body_len = conn->ksnc_hdr.msg.reply.length;
+                break;
+        default:
+                /* Unrecognised packet type */
+                CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n",
+                        conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                /* Ignore this header and go back to reading a new packet. */
+                ktoenal_new_packet (conn, 0);
+                return;
+        }
+
+        if (body_len < 0)                               /* length corrupt */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD)         /* too big to forward */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, body_len);    /* on to new packet (skip this one's body) */
+                return;
+        }
+
+        conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */
+        if (conn2 != NULL)
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                ktoenal_put_conn (conn2);          /* drop ref from get above */
+
+                ktoenal_new_packet (conn, body_len);  /* on to next packet (skip this one's body) */
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+/*
+ * Set 'conn' up for its next read.
+ *
+ * nob_to_skip == 0: the stream is at a packet boundary, so arrange to
+ * read a fresh header into conn->ksnc_hdr and return 1.
+ *
+ * Otherwise: arrange to discard up to nob_to_skip bytes into a shared
+ * scratch buffer (state SOCKNAL_RX_SLOP) and return 0.  At most as many
+ * bytes as fit in the conn's rx iov array are set up per call; the rest
+ * stays recorded in ksnc_rx_nob_left for a later call.
+ */
+int
+ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ktoenal_slop_buffer[4096];
+
+        int   frag_nob;
+        int   nfrags = 0;
+        int   total = 0;
+
+        if (nob_to_skip == 0) {                 /* right at next packet boundary now */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+                return (1);
+        }
+
+        /* set up to skip as much as possible now */
+        /* if there's more left (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+
+        do {
+                /* every fragment points at the same throwaway buffer */
+                frag_nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer));
+
+                conn->ksnc_rx_iov[nfrags].iov_base = ktoenal_slop_buffer;
+                conn->ksnc_rx_iov[nfrags].iov_len  = frag_nob;
+                nfrags++;
+                total += frag_nob;
+                nob_to_skip -= frag_nob;
+
+        } while (nob_to_skip != 0 &&            /* mustn't overflow conn's rx iov */
+                 nfrags < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_niov = nfrags;
+        conn->ksnc_rx_nob_wanted = total;
+        return (0);
+}
+
+void
+ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_fmb_t *fmb;
+        int          len;
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* NB: sched lock held */
+        CDEBUG(D_NET, "conn %p\n", conn);
+
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)     /* doesn't need a forwarding buffer */
+        {
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                goto try_read;
+        }
+
+ get_fmb:
+        /* NB: sched lock held */
+        fmb = ktoenal_get_idle_fmb (conn);
+        if (fmb == NULL)                        /* conn descheduled waiting for idle fmb */
+                return;
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+        
+        if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;                /* data ready may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to read */
+
+        /* NB ktoenal_recvmsg "consumes" the iov passed to it */
+        len = ktoenal_recvmsg(conn->ksnc_file,
+                               conn->ksnc_rx_iov, conn->ksnc_rx_niov,
+                               conn->ksnc_rx_nob_wanted);
+        CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len);
+
+        if (len <= 0)                           /* nothing ready (EAGAIN) or EOF or error */
+        {
+                if (len != -EAGAIN &&           /* ! nothing to read now */
+                    len != 0)                   /* ! nothing to read ever */
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal read(%d) %p: %d\n",
+                                conn->ksnc_rx_nob_wanted, conn, len);
+                }
+                goto out;                       /* come back when there's data ready */
+        }
+
+        LASSERT (len <= conn->ksnc_rx_nob_wanted);
+        conn->ksnc_rx_nob_wanted -= len;
+        conn->ksnc_rx_nob_left -= len;
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        conn->ksnc_rx_ready = 1;                /* assume there's more to be had */
+
+        switch (conn->ksnc_rx_state)
+        {
+        case SOCKNAL_RX_HEADER:
+                if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */
+                {
+                        ktoenal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state)
+                        {
+                        case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping this packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                        }
+                        /* Not Reached */
+                        LBUG ();
+                }
+
+                PROF_START(lib_parse);
+                lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */
+                {
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ktoenal_packets_received);
+                lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */
+                        goto out;               /* come back later */
+                goto try_read;                  /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ktoenal_packets_received);
+
+                /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */
+                kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        if (!conn->ksnc_rx_ready)               /* no data there to read? */
+        {
+                conn->ksnc_rx_scheduled = 0;    /* let socket callback schedule again */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+}
+
+/*
+ * Receive callback: record where the payload of the message currently being
+ * parsed on this connection must be delivered.
+ *
+ * 'private' is the connection; 'msg' is stashed in ksnc_cookie, the caller's
+ * 'niov' fragments are copied into the connection's receive iov array, and
+ * the wanted/left byte counts are set from 'mlen' and 'rlen'.
+ *
+ * Returns 'rlen'.
+ */
+int
+ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg,
+             unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+        unsigned int  frag = 0;
+
+        conn->ksnc_cookie = msg;
+
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        /* copy the fragment descriptors one by one */
+        while (frag < niov) {
+                conn->ksnc_rx_iov[frag].iov_base = iov[frag].iov_base;
+                conn->ksnc_rx_iov[frag].iov_len  = iov[frag].iov_len;
+                frag++;
+        }
+
+        conn->ksnc_rx_nob_left   = rlen;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_niov       = niov;
+
+        return (rlen);
+}
+
+/*
+ * Scheduler thread body: until ksnd_shuttingdown is set, alternately service
+ * one connection from the rx list and one from the tx list, then either
+ * sleep (nothing was queued) or yield (SOCKNAL_RESCHED iterations were run
+ * back to back).
+ *
+ * ksnd_sched_lock is held at the top of every loop iteration; it is only
+ * released around the sleep/yield and inside the process_* helpers.
+ *
+ * Always returns 0.
+ */
+int
+ktoenal_scheduler (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        int                nloops = 0;
+
+        kportal_daemonize ("ktoenal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                /* at most one receiving connection per pass... */
+                if (!list_empty (&ktoenal_data.ksnd_rx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_rx_conns.next,
+                                           ksock_conn_t, ksnc_rx_list);
+                        list_del (&conn->ksnc_rx_list);
+
+                        ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */
+                }
+
+                /* ...and at most one transmitting connection per pass */
+                if (!list_empty (&ktoenal_data.ksnd_tx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_tx_conns.next,
+                                           ksock_conn_t, ksnc_tx_list);
+
+                        list_del (&conn->ksnc_tx_list);
+                        ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */
+                }
+
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) /* hogging CPU? */
+                {
+                        /* must not sleep or yield while holding the spinlock */
+                        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+                                rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq,
+                                                               ktoenal_data.ksnd_shuttingdown ||
+                                                               !list_empty (&ktoenal_data.ksnd_rx_conns) ||
+                                                               !list_empty (&ktoenal_data.ksnd_tx_conns));
+                                /* NOTE(review): relies on kportal_blockallsigs()
+                                 * above making an interrupted wait impossible
+                                 * -- confirm */
+                                LASSERT (rc == 0);
+                        } else 
+                                our_cond_resched();
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+
+/*
+ * Reaper thread body: until ksnd_shuttingdown is set, pull connections off
+ * ksnd_reaper_list one at a time (under ksnd_reaper_lock) and close each of
+ * them with the lock released; sleep on ksnd_reaper_waitq while the list is
+ * empty.
+ *
+ * Always returns 0.
+ */
+int
+ktoenal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ktoenal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ktoenal_data.ksnd_shuttingdown) {
+                conn = NULL;
+
+                /* detach the first victim, if there is one */
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+                if (!list_empty (&ktoenal_data.ksnd_reaper_list)) {
+                        conn = list_entry (ktoenal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (conn == NULL) {
+                        /* nothing to reap: wait for work or shutdown */
+                        rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq,
+                                                       ktoenal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ktoenal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                        continue;
+                }
+
+                ktoenal_close_conn (conn);
+        }
+
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+/* poll() event bits that are treated as "readable" / "writable" below */
+#define POLLREAD        (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)
+#define POLLWRITE       (POLLOUT | POLLWRNORM | POLLWRBAND)
+
+/*
+ * Poll thread body: until ksnd_shuttingdown is set, walk ksnd_socklist,
+ * call each connection's file poll method and turn readable/writable
+ * results into ktoenal_data_ready() / ktoenal_write_space() calls, then
+ * sleep until woken.
+ *
+ * The poll wait table (ksnd_pwait) is only handed to poll() on passes where
+ * ksnd_slistchange is set; it is torn down and re-initialised after a sleep
+ * during which ksnd_slistchange was set again.
+ *
+ * Always returns 0.
+ */
+int
+ktoenal_pollthread(void *arg)
+{
+        unsigned int mask;
+        struct list_head *tmp;
+        ksock_conn_t *conn;
+        
+        /* Save the task struct for waking it up */
+        ktoenal_data.ksnd_pollthread_tsk = current; 
+        
+        kportal_daemonize ("ktoenal_pollthread");
+        kportal_blockallsigs ();
+        
+        poll_initwait(&ktoenal_data.ksnd_pwait);
+        
+        while(!ktoenal_data.ksnd_shuttingdown) {
+                
+                /* set before scanning so that a wakeup arriving during the
+                 * scan is not lost by the schedule_timeout() below */
+                set_current_state(TASK_INTERRUPTIBLE);
+                
+                read_lock (&ktoenal_data.ksnd_socklist_lock);
+                list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+                        
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        /* take a ref on conn for the time the list lock
+                         * is dropped */
+                        atomic_inc(&conn->ksnc_refcount);
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                        
+                        mask = conn->ksnc_file->f_op->poll(conn->ksnc_file,
+                                  ktoenal_data.ksnd_slistchange ? 
+                                  &ktoenal_data.ksnd_pwait : NULL);
+                         
+                        if(mask & POLLREAD) {
+                                ktoenal_data_ready(conn);
+                                                        
+                        } 
+                        if (mask & POLLWRITE) {
+                                ktoenal_write_space(conn);  
+                              
+                        }
+                        /* NB error/hangup conditions are currently ignored */
+                        if (mask & (POLLERR | POLLHUP)) {
+                                         /* Do error processing */          
+                        }      
+                        
+                        read_lock (&ktoenal_data.ksnd_socklist_lock);
+                        /* NOTE(review): if this was the last ref,
+                         * _ktoenal_put_conn() runs with the read lock held
+                         * and list_for_each() then follows tmp->next out of
+                         * this conn; also the list may have changed while
+                         * the lock was dropped -- confirm this is safe */
+                        if(atomic_dec_and_test(&conn->ksnc_refcount))
+                                _ktoenal_put_conn(conn);
+                }
+                /* NOTE(review): written while holding only the read lock */
+                ktoenal_data.ksnd_slistchange = 0;
+                read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                
+                schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+                /* socket list changed while asleep: rebuild the wait table */
+                if(ktoenal_data.ksnd_slistchange) {
+                        poll_freewait(&ktoenal_data.ksnd_pwait); 
+                        poll_initwait(&ktoenal_data.ksnd_pwait);
+                }
+         }
+        poll_freewait(&ktoenal_data.ksnd_pwait);
+        ktoenal_thread_fini();
+        return (0);
+}
+
+/*
+ * Note that 'conn' has data to read.  If this call is the one that flips
+ * bit 0 of ksnc_rx_ready from clear to set, and the connection is not
+ * already scheduled for receive, queue it on ksnd_rx_conns (taking an
+ * extra reference for the scheduler) and wake the scheduler if it sleeps.
+ */
+void
+ktoenal_data_ready (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+        ENTRY;
+
+        /* somebody else already flagged this connection as ready */
+        if (test_and_set_bit (0, &conn->ksnc_rx_ready))
+                goto done;
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+        if (conn->ksnc_rx_scheduled == 0) {     /* not being progressed */
+                list_add_tail (&conn->ksnc_rx_list,
+                               &ktoenal_data.ksnd_rx_conns);
+                conn->ksnc_rx_scheduled = 1;
+
+                /* the scheduler owns its own reference */
+                atomic_inc (&conn->ksnc_refcount);
+
+                /* Re-assert readiness under the lock so that a sequence
+                 * of events cannot cause rx_ready to be lost
+                 */
+                conn->ksnc_rx_ready = 1;
+
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+ done:
+        EXIT;
+}
+
+/*
+ * Note that 'conn' can accept more data.  If this call is the one that
+ * flips bit 0 of ksnc_tx_ready from clear to set, and the connection has
+ * packets queued but is not already scheduled for transmit, queue it on
+ * ksnd_tx_conns (taking an extra reference for the scheduler) and wake the
+ * scheduler if it sleeps.
+ *
+ * A NULL 'conn' is logged and otherwise ignored.
+ */
+void
+ktoenal_write_space (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+
+        CDEBUG (D_NET, "conn %p%s%s%s\n",
+                         conn,
+                        (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"),
+                        (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? " scheduled" : " idle"),
+                        (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued"));
+
+        /* the debug message above tolerates a NULL conn, so the code that
+         * follows must not dereference one either */
+        if (conn == NULL)
+                return;
+
+        if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */
+                                !conn->ksnc_tx_scheduled) { /* not being progressed */
+
+                        list_add_tail (&conn->ksnc_tx_list, 
+                                        &ktoenal_data.ksnd_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+}
+
+/* Callback table through which the portals library drives this NAL */
+nal_cb_t ktoenal_lib = {
+        .nal_data    = &ktoenal_data,           /* NAL private data */
+        .cb_send     = ktoenal_send,
+        .cb_recv     = ktoenal_recv,
+        .cb_read     = ktoenal_read,
+        .cb_write    = ktoenal_write,
+        .cb_callback = ktoenal_callback,
+        .cb_malloc   = ktoenal_malloc,
+        .cb_free     = ktoenal_free,
+        .cb_printf   = ktoenal_printf,
+        .cb_cli      = ktoenal_cli,
+        .cb_sti      = ktoenal_sti,
+        .cb_dist     = ktoenal_dist
+};
diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am
new file mode 100644 (file)
index 0000000..e2e11af
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+# Builds the "portals" kernel module from the local sources plus the
+# library/API sources that are symlinked in from ../portals.
+MODULE = portals
+modulenet_DATA = portals.o
+EXTRA_PROGRAMS = portals
+
+# Sources that live in ../portals and are linked into this directory
+LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-not-impl.c lib-pid.c
+APILINKS := api-eq.c api-errno.c api-init.c api-md.c api-me.c api-ni.c api-wrap.c
+LINKS = $(APILINKS) $(LIBLINKS) 
+DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej
+
+# Every linked source depends on link-stamp, whose recipe (re)creates all of
+# the symlinks in one go.  The leading '-' makes make ignore a failing loop.
+$(LINKS): link-stamp
+link-stamp:
+       -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       echo timestamp > link-stamp
+
+DEFS =
+portals_SOURCES = $(LINKS) module.c proc.c debug.c
+
+# Don't distribute any patched files.
+# NOTE(review): EXT2C is not defined in this file; if it is not set by an
+# included file this hook removes nothing -- confirm
+dist-hook:
+       list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done
+
+include ../Rules.linux
diff --git a/lnet/libcfs/Makefile.mk b/lnet/libcfs/Makefile.mk
new file mode 100644 (file)
index 0000000..3196ea2
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+# libcfs.o is a composite object: kbuild takes its member objects from the
+# variable named <object>-objs, so the prefix must match "libcfs" exactly.
+obj-y += libcfs.o
+libcfs-objs    := module.o proc.o debug.o
\ No newline at end of file
diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c
new file mode 100644 (file)
index 0000000..6233b8d
--- /dev/null
@@ -0,0 +1,821 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+
+#define DEBUG_OVERFLOW 1024
+/* base of the in-memory debug log buffer and its size in bytes */
+static char *debug_buf = NULL;
+static unsigned long debug_size = 0;
+/* current write offset into debug_buf */
+static atomic_t debug_off_a = ATOMIC_INIT(0);
+/* when non-zero, the log dump writes the region after the current offset
+ * first and then the region before it */
+static int debug_wrapped;
+/* portals_debug_dumplog() sleeps here until the dump thread is done */
+wait_queue_head_t debug_ctlwq;
+/* amount of pending data at which the daemon writes without being forced */
+#define DAEMON_SND_SIZE      (64 << 10)
+
+/*
+ * used by the daemon to keep track of the offset into debug_buffer for the
+ * next write to the file.  Usually, the daemon is to write out the buffer
+ * from debug_daemon_next_write up to debug_off
+ *  variable usage
+ *      Reader - portals_debug_msg()
+ *      Writer - portals_debug_daemon()
+ *               portals_debug_daemon_start() during daemon init time
+ *               portals_debug_daemon_continue() to reset to debug_off
+ *               portals_debug_clear_buffer() reset to debug_off for clear
+ *      Note that *_start(), *_continue() & *clear_buffer() should be
+ *      serialized;
+ */
+static atomic_t   debug_daemon_next_write;
+
+/*
+ * A debug_daemon can be in following states
+ *      stopped - stopped state means there is no debug_daemon running.
+ *                accordingly, it must be in paused state
+ *                a daemon is in !stopped && !paused state after
+ *                "lctl debug_daemon start" creates debug_daemon successfully
+ *                Variable Usage
+ *                      Reader - portals_debug_daemon()
+ *                               portals_debug_set_daemon() routines
+ *                      Writer - portals_debug_set_daemon() routines
+ *                              portals_debug_daemon() on IO error
+ *      paused -  a debug_daemon state is changed from !paused into paused
+ *                when "lctl debug_daemon paused" is issued
+ *                "lctl debug_daemon continue" gets a daemon into !paused mode
+ *                      Reader - portals_debug_set_daemon() routines
+ *                               portals_debug_msg()
+ *                      Writer - portals_debug_set_daemon() on init
+ *                               portals_debug_daemon()
+ *
+ *        Daemon  state diagram.
+ *                      (stopped, paused)
+ *                              |  <-- debug_daemon start
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon pause
+ *                              V
+ *                      (!stopped, paused)
+ *                              |  <-- debug_daemon continue
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon stop
+ *                              V
+ *                      (stopped, paused)
+ *      Overlapped - this is a state when CDEBUG is too fast for the daemon to
+ *                   write out the debug_buffer.  That is, debug_off is to
+ *                   overlap debug_daemon_next_write;
+ *                     Reader - portals_debug_msg()
+ *                     Writer - portals_debug_msg()
+ */
+
+/*
+ * Description on Trace Daemon Synchronization
+ *
+ * Three categories of code are synchronizing between each other
+ * 1.   lctl, portals_debug_set_daemon(), the user debug control code, 
+ *      as well as portals_debug_clear_buffer()
+ * 2.   CDEBUG, portals_debug_msg(), the debug put messages routine
+ * 3.   Daemon, portals_debug_daemon(), to write out debug log file
+ *
+ *
+ * Three different controls for synchronizations
+ *
+ * 1.   debug_daemon_semaphore
+ *      The usage of this semaphore is to serialize multiple lctl controls
+ *      in manipulating debug daemon state.  The semaphore serves as the
+ *      gatekeeper to allow only one user control thread, at any given time,
+ *      to access debug daemon state and keeps the other user control requests
+ *      in wait state until the current control request is serviced.
+ *
+ * 2.   wait_queue_head_t lctl (paired with lctl_event flag)
+ *      Lctl event is the event between portals_debug_set_daemon() and 
+ *      portals_debug_daemon().  Lctl is an indicator for portals_debug_daemon()
+ *      to flush data out to file.  portals_debug_daemon() is to use lctl event
+ *      as signal channel to wakeup portals_debug_set_daemon() upon flush 
+ *      operation is done.
+ *
+ *      Producer :
+ *              portals_debug_daemon() uses to wake up 
+ *              portals_debug_set_daemon(), pause and stop, routines
+ *      Consumer :
+ *              portals_debug_set_daemon(), stop and pause operations, 
+ *              wait and sleep on the event
+ *
+ * 3.   wait_queue_head_t daemon (paired with daemon_event flag)
+ *      This is an event channel to wake up portals_debug_daemon.  Daemon
+ *      wakes up to run whenever there is an event posted.   Daemon handles
+ *      2 types of operations: 1. Writes data out to debug file, 2. Flushes
+ *      file and terminates based on the lctl event.
+ *      File operation -
+ *              Daemon is normally in a sleep state.  
+ *              Daemon is woken up through daemon event whenever CDEBUG is 
+ *              putting data over any 64K boundary. 
+ *      File flush and termination -
+ *              On portals_debug_daemon_stop/pause() operations, lctl control 
+ *              is to wake up daemon through daemon event.
+ *
+ *      We can't use sleep_on() and wake_up() to replace daemon event because
+ *      portals_debug_daemon() must catch the wakeup operation posted by
+ *      portals_debug_daemon_stop/pause().  Otherwise, stop and pause may
+ *      get stuck in the lctl wait event.
+ *
+ *      Producer :
+ *           a. portals_debug_daemon_pause() and portals_debug_daemon_stop() 
+ *              uses the event to wake up portals_debug_daemon()
+ *           b. portals_debug_msg() uses the event to wake up
+ *              portals_debug_daemon() whenever the data output crosses
+ *              a 64K byte boundary.
+ *      Consumer :
+ *              portals_debug_daemon() wakes up upon daemon event.
+ *
+ * Sequence for portals_debug_daemon_stop() operation
+ *
+ * _Portals_debug_daemon_stop()_          _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      Paused = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Set force_flush flag if lctlevnt
+ *                                      Flush data
+ *                                      Wakeup_event (lctl)
+ *                                      Wait_event(daemon)
+ *      Stopped = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Exit daemon loop if (Stopped)
+ *                                      Wakeup_event (lctl)
+ *                                      Exit
+ *      Return to user application
+ *
+ *
+ * _Portals_debug_msg()_                  _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      If (WriteStart<64K<WriteEnd)
+ *         Wakeup_event(daemon)
+ *                                      Do file IO
+ *                                      Wait_event(daemon)
+ */
+/*
+ * State shared between the lctl control routines, portals_debug_msg() and
+ * the debug daemon thread; see the state diagram and the synchronization
+ * description above for how the fields interact.
+ */
+struct debug_daemon_state {
+        unsigned long overlapped;       /* CDEBUG outran the daemon */
+        unsigned long stopped;          /* no daemon thread is running */
+        atomic_t paused;                /* daemon is not writing the log */
+        unsigned long   lctl_event;     /* event for lctl */
+        wait_queue_head_t lctl;         /* control routines wait here */
+        unsigned long   daemon_event;   /* event for daemon */
+        wait_queue_head_t daemon;       /* daemon thread waits here */
+};
+static struct debug_daemon_state debug_daemon_state;
+/* serializes the control requests in portals_debug_set_daemon() */
+static DECLARE_MUTEX(debug_daemon_semaphore);
+
+/* file offset at which the daemon wraps its output back to 0; set by
+ * portals_debug_daemon_start(), 0 means the daemon never wraps */
+static loff_t daemon_file_size_limit;
+/* file the daemon logs to; emptied again by portals_debug_daemon_stop() */
+char debug_daemon_file_path[1024] = "";
+
+spinlock_t portals_debug_lock = SPIN_LOCK_UNLOCKED;
+/* prefix of the one-shot dump file name; ".<time>" is appended to it */
+char debug_file_path[1024] = "/tmp/lustre-log";
+/* full name of the most recent dump file, built by the dump thread */
+char debug_file_name[1024];
+int handled_panic; /* to avoid recursive calls to notifiers */
+char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall";
+
+
+/*
+ * Kernel thread body that dumps the in-memory debug buffer to a new file
+ * named "<debug_file_path>.<current time>".
+ *
+ * If the buffer has wrapped, the region after the current offset is written
+ * first, followed by the region before it; otherwise only the region before
+ * the current offset is written.  The file is then synced and closed.
+ *
+ * Whatever happens, sleepers on debug_ctlwq are woken before returning.
+ * 'arg' is unused.  Always returns 0.
+ */
+int portals_do_debug_dumplog(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        int rc;
+        mm_segment_t oldfs;
+        unsigned long debug_off;
+
+        kportal_daemonize("");
+
+        reparent_to_init();
+        /* the write below must not see a journal handle belonging to
+         * whoever spawned this thread; it is put back before returning */
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+        /* debug_file_path may be as large as debug_file_name itself, so the
+         * formatted name has to be bounded */
+        snprintf(debug_file_name, sizeof(debug_file_name), "%s.%ld",
+                 debug_file_path, CURRENT_TIME);
+        file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for dumping\n", debug_file_name);
+                GOTO(out, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+                       debug_file_name);
+        }
+
+        debug_off = atomic_read(&debug_off_a);
+        /* the data being written lives in kernel space */
+        oldfs = get_fs();
+        set_fs(get_ds());
+        if (debug_wrapped) {
+                rc = file->f_op->write(file, debug_buf + debug_off + 1,
+                                       debug_size-debug_off-1, &file->f_pos);
+                if (rc >= 0) {
+                        int rc2 = file->f_op->write(file, debug_buf,
+                                                    debug_off + 1,
+                                                    &file->f_pos);
+
+                        /* don't fold an error code into a byte count */
+                        rc = (rc2 < 0) ? rc2 : rc + rc2;
+                }
+        } else {
+                rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
+        }
+        if (rc < 0)
+                CERROR("write returns %d\n", rc);
+        else
+                printk("wrote %d bytes\n", rc);
+        set_fs(oldfs);
+
+        rc = file->f_op->fsync(file, file->f_dentry, 1);
+        if (rc)
+                CERROR("sync returns %d\n", rc);
+        filp_close(file, 0);
+out:
+        current->journal_info = journal_info;
+        wake_up(&debug_ctlwq);
+        return 0;
+}
+
+/*
+ * Debug daemon thread body: opens debug_daemon_file_path and then loops,
+ * copying the part of the debug buffer between debug_daemon_next_write and
+ * the current debug offset out to the file, and sleeping on the daemon
+ * event in between.
+ *
+ * On every exit path lctl_event is set and the lctl wait queue is woken, so
+ * a control routine waiting for the daemon is released.
+ *
+ * 'arg' is unused.  Always returns 0.
+ */
+int portals_debug_daemon(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        mm_segment_t oldfs;
+        unsigned long force_flush = 0;
+        unsigned long size;
+        int rc;
+
+        kportal_daemonize("ldebug_daemon");
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+
+        file = filp_open(debug_daemon_file_path,
+                         O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for logging", debug_daemon_file_path);
+                GOTO(out1, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+                       debug_daemon_file_path);
+        }
+
+        /* the daemon is now running: !stopped, !paused */
+        debug_daemon_state.overlapped = 0;
+        debug_daemon_state.stopped = 0;
+        atomic_set(&debug_daemon_state.paused, 0);
+        oldfs = get_fs();
+        set_fs(KERNEL_DS);
+        while (1) {
+                unsigned long ending;
+                unsigned long start, tail;
+                long delta;
+
+                /* consume the wakeup that got us here */
+                debug_daemon_state.daemon_event = 0;
+
+                ending = atomic_read(&debug_off_a);
+                start = atomic_read(&debug_daemon_next_write);
+
+                /* check if paused is imposed by lctl ? */
+                /* start/pause/stop zero lctl_event before waiting for us, so
+                 * a zero here means somebody is waiting for a flush */
+                force_flush = !debug_daemon_state.lctl_event;
+
+                /* delta < 0 means the writer has wrapped past the end of the
+                 * buffer; write the tail first in that case */
+                delta = ending - start;
+                tail = debug_size - start;
+                size = (delta >= 0) ? delta : tail;
+                while (size && (force_flush || (delta < 0) ||
+                                (size >= DAEMON_SND_SIZE))) {
+                        /* never write beyond the file size limit */
+                        if (daemon_file_size_limit) {
+                               int ssize = daemon_file_size_limit - file->f_pos;
+                               if (size > ssize)
+                                        size = ssize;
+                        }
+
+                        /* NOTE(review): a write that returns 0 leaves start
+                         * and size unchanged, so this loop would not
+                         * terminate -- confirm that cannot happen */
+                        rc = file->f_op->write(file, debug_buf+start,
+                                               size, &file->f_pos);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                           "Debug_daemon write error %d\n", rc);
+                                goto out;
+                        }
+                        /* work out what is left to write after this chunk,
+                         * wrapping back to the buffer start at its end */
+                        start += rc;
+                        delta = ending - start;
+                        tail = debug_size - start;
+                        if (tail == 0)
+                                start = 0;
+                        if (delta >= 0)
+                                size = delta;
+                        else
+                                size = (tail == 0) ? ending : tail;
+                        if (daemon_file_size_limit == file->f_pos) {
+                                // file wrapped around
+                                file->f_pos = 0;
+                        }
+                }
+                atomic_set(&debug_daemon_next_write, start);
+                if (force_flush) {
+                        rc = file->f_op->fsync(file, file->f_dentry, 1);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                       "Debug_daemon sync error %d\n", rc);
+                                goto out;
+                        }
+                        /* a stop request leaves via 'out', which wakes the
+                         * waiting control routine */
+                        if (debug_daemon_state.stopped)
+                               break;           
+                        /* tell the waiting control routine the flush is done */
+                        debug_daemon_state.lctl_event = 1;
+                        wake_up(&debug_daemon_state.lctl);
+                }
+                wait_event(debug_daemon_state.daemon, 
+                           debug_daemon_state.daemon_event);
+                }
+out:
+        /* back to the (stopped, paused) state, also after an I/O error */
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+        set_fs(oldfs);
+        filp_close(file, 0);
+        current->journal_info = journal_info;
+out1:
+        debug_daemon_state.lctl_event = 1;
+        wake_up(&debug_daemon_state.lctl);
+        return 0;
+}
+
+/*
+ * Print the 64KB of the debug buffer that precede the current debug offset
+ * to the console, in pieces of at most 1024 bytes per printk().
+ *
+ * When the 64KB window reaches back past the start of the buffer it is
+ * printed as two ranges: the end of the buffer first, then the start of the
+ * buffer up to the current offset.
+ */
+void portals_debug_print(void)
+{
+        unsigned long dumplen = 64 * 1024;
+        char *start1, *start2;
+        char *end1, *end2;
+        unsigned long debug_off = atomic_read(&debug_off_a);
+
+        start1 = debug_buf + debug_off - dumplen;
+        if (start1 < debug_buf) {
+                /* window straddles the wrap point */
+                start1 += debug_size;
+                end1 = debug_buf + debug_size - 1;
+                start2 = debug_buf;
+                end2 = debug_buf + debug_off;
+        } else {
+                /* one contiguous range; the second range is empty */
+                end1 = debug_buf + debug_off;
+                start2 = debug_buf + debug_off;
+                end2 = debug_buf + debug_off;
+        }
+
+        while (start1 < end1) {
+                int count = MIN(1024, end1 - start1);
+                /* "%.*s" caps the output at 'count' bytes; a bare "%*s"
+                 * would only pad and keep reading up to the next NUL */
+                printk("%.*s", count, start1);
+                start1 += 1024;
+        }
+        while (start2 < end2) {
+                int count = MIN(1024, end2 - start2);
+                printk("%.*s", count, start2);
+                start2 += 1024;
+        }
+}
+
+/*
+ * Dump the debug buffer to a file: start a kernel thread running
+ * portals_do_debug_dumplog() and sleep on debug_ctlwq until that thread
+ * wakes it.  If the thread cannot be started, log an error and return
+ * without sleeping.
+ *
+ * NOTE(review): the thread calls wake_up() on debug_ctlwq when it is done;
+ * if it gets there before sleep_on() below is reached the wakeup looks
+ * like it would be missed -- confirm.
+ */
+void portals_debug_dumplog(void)
+{
+        int pid;
+        ENTRY;
+
+        init_waitqueue_head(&debug_ctlwq);
+
+        pid = kernel_thread(portals_do_debug_dumplog,
+                            NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (pid >= 0) {
+                /* wait for the dump thread to finish */
+                sleep_on(&debug_ctlwq);
+                return;
+        }
+
+        printk(KERN_ERR "cannot start dump thread\n");
+}
+
+/*
+ * Start the debug daemon thread.
+ *
+ * file: log file path, copied into debug_daemon_file_path (truncated to
+ *       fit if necessary); NULL keeps the current path.
+ * size: log file size limit in megabytes; 0 means no limit.
+ *
+ * Returns -EALREADY if a daemon is already running, the negative result of
+ * kernel_thread() if the thread cannot be created, and 0 otherwise, after
+ * the new thread has signalled the lctl event.
+ */
+int portals_debug_daemon_start(char *file, unsigned int size)
+{
+        int rc;
+
+        if (!debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (file != NULL) {
+                /* strncpy() does not terminate the copy when the source
+                 * fills the whole buffer, so terminate it by hand */
+                strncpy(debug_daemon_file_path, file,
+                        sizeof(debug_daemon_file_path) - 1);
+                debug_daemon_file_path[sizeof(debug_daemon_file_path) - 1] =
+                        '\0';
+        }
+
+        init_waitqueue_head(&debug_daemon_state.lctl);
+        init_waitqueue_head(&debug_daemon_state.daemon);
+
+        /* the daemon only logs what is produced from now on */
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+
+        /* NOTE(review): the shift is done in unsigned int, so sizes of
+         * 4096MB and more wrap before being widened -- confirm callers
+         * never pass such values */
+        daemon_file_size_limit = size << 20;
+
+        /* cleared here, set by the daemon thread once it is up (or has
+         * failed to open its file) */
+        debug_daemon_state.lctl_event = 0;
+        rc = kernel_thread(portals_debug_daemon, NULL, 0);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start debug daemon thread\n");
+                debug_daemon_file_path[0] = '\0';
+                return rc;
+        }
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+/*
+ * Mark the debug daemon paused, wake it, and wait until lctl_event is set.
+ * Returns 0 on success or -EALREADY if the daemon was already paused.
+ */
+int portals_debug_daemon_pause(void)
+{
+        if (atomic_read(&debug_daemon_state.paused) != 0)
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+
+        /* clear our event before waking the daemon, then wait for it */
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+
+        return 0;
+}
+
+/*
+ * Resume a paused debug daemon: clear the overlapped flag, restart the
+ * daemon's write position at the current buffer offset and clear "paused".
+ * Returns -EINVAL if the daemon is not paused or is stopped, 0 otherwise.
+ */
+int portals_debug_daemon_continue(void)
+{
+        if (!atomic_read(&debug_daemon_state.paused) ||
+            debug_daemon_state.stopped)
+                return -EINVAL;
+
+        debug_daemon_state.overlapped = 0;
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+        atomic_set(&debug_daemon_state.paused, 0);
+
+        return 0;
+}
+
+/*
+ * Stop the debug daemon.  The daemon is paused first if it is running,
+ * then flagged as stopped and woken; the caller waits for lctl_event and
+ * finally clears debug_daemon_file_path.
+ * Returns 0 on success or -EALREADY if the daemon is already stopped.
+ */
+int portals_debug_daemon_stop(void)
+{
+        if (debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (atomic_read(&debug_daemon_state.paused) == 0)
+                portals_debug_daemon_pause();
+
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.stopped = 1;
+        debug_daemon_state.daemon_event = 1;
+
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+
+        debug_daemon_file_path[0] = '\0';
+
+        return 0;
+}
+
+/*
+ * Dispatch a debug daemon control command while holding
+ * debug_daemon_semaphore.
+ *
+ * cmd:      one of DEBUG_DAEMON_START/STOP/PAUSE/CONTINUE
+ * length:   length of filename including its terminating NUL (START only)
+ * filename: output file for START
+ * size:     size limit handed to portals_debug_daemon_start()
+ *
+ * Returns the result of the selected operation, or -EINVAL for an unknown
+ * command or a filename that is not NUL terminated.
+ */
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                             char *filename, unsigned int size)
+{
+        int rc;
+
+        down(&debug_daemon_semaphore);
+
+        if (cmd == DEBUG_DAEMON_START) {
+                if (length != 0 && filename[length - 1] != '\0') {
+                        CERROR("Invalid filename for debug_daemon\n");
+                        rc = -EINVAL;
+                } else {
+                        rc = portals_debug_daemon_start(filename, size);
+                }
+        } else if (cmd == DEBUG_DAEMON_STOP) {
+                rc = portals_debug_daemon_stop();
+        } else if (cmd == DEBUG_DAEMON_PAUSE) {
+                rc = portals_debug_daemon_pause();
+        } else if (cmd == DEBUG_DAEMON_CONTINUE) {
+                rc = portals_debug_daemon_continue();
+        } else {
+                CERROR("unknown set_daemon cmd\n");
+                rc = -EINVAL;
+        }
+
+        up(&debug_daemon_semaphore);
+        return rc;
+}
+
+/*
+ * Panic notifier callback.  Runs at most once (guarded by handled_panic).
+ * In interrupt context the log is printed via portals_debug_print();
+ * otherwise the big kernel lock is dropped completely and the log is
+ * dumped through portals_debug_dumplog().  Always returns 0.
+ */
+static int panic_dumplog(struct notifier_block *self, unsigned long unused1,
+                         void *unused2)
+{
+        if (handled_panic)
+                return 0;
+        handled_panic = 1;
+
+        if (!in_interrupt()) {
+                /* release every nested hold on the BKL before dumping */
+                while (current->lock_depth >= 0)
+                        unlock_kernel();
+                portals_debug_dumplog();
+        } else {
+                portals_debug_print();
+        }
+
+        return 0;
+}
+
+/* Notifier block that hooks panic_dumplog() into the panic notifier chain. */
+static struct notifier_block lustre_panic_notifier = {
+        .notifier_call = panic_dumplog,
+        .next          = NULL,
+        .priority      = 10000
+};
+
+/*
+ * Allocate and zero the debug buffer and register the panic notifier.
+ *
+ * bufsize: usable size of the buffer in bytes; DEBUG_OVERFLOW extra bytes
+ *          are allocated after it
+ *
+ * Returns 0 on success, -EALREADY if a buffer is already allocated, or
+ * -ENOMEM if the allocation fails.
+ */
+int portals_debug_init(unsigned long bufsize)
+{
+        unsigned long debug_off = atomic_read(&debug_off_a);
+        char *buf;
+
+        if (debug_buf != NULL)
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+
+        buf = vmalloc(bufsize + DEBUG_OVERFLOW);
+        if (buf == NULL)
+                return -ENOMEM;
+
+        /* Zero the whole allocation.  The size must come from bufsize:
+         * debug_size has not been assigned yet at this point. */
+        memset(buf, 0, bufsize + DEBUG_OVERFLOW);
+        debug_wrapped = 0;
+
+        /* Publish the size before the pointer so that code testing
+         * debug_buf != NULL never sees a stale debug_size. */
+        debug_size = bufsize;
+        debug_buf = buf;
+
+        printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n",
+               bufsize, debug_buf);
+        atomic_set(&debug_off_a, debug_off);
+        notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier);
+
+        return 0;
+}
+
+/*
+ * Unregister the panic notifier, stop the debug daemon and free the debug
+ * buffer.
+ *
+ * Returns 0 on success or -EINVAL if no buffer was allocated.
+ */
+int portals_debug_cleanup(void)
+{
+        char *buf;
+
+        notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier);
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        portals_debug_daemon_stop();
+
+        /* Clear the global pointer before freeing so that later callers
+         * (portals_debug_msg(), portals_debug_init()) see "no buffer"
+         * instead of a dangling pointer. */
+        buf = debug_buf;
+        debug_buf = NULL;
+        vfree(buf);
+
+        atomic_set(&debug_off_a, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/*
+ * Reset the debug buffer to empty.  The daemon is paused for the duration
+ * if it was running, and resumed afterwards.
+ * Returns 0 on success or -EINVAL if no buffer is allocated.
+ */
+int portals_debug_clear_buffer(void)
+{
+        unsigned long irq_flags;
+        unsigned long was_paused;
+
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+
+        was_paused = atomic_read(&debug_daemon_state.paused);
+        if (was_paused == 0)
+                portals_debug_daemon_pause();
+
+        /* rewind both the producer and the daemon's consumer position */
+        spin_lock_irqsave(&portals_debug_lock, irq_flags);
+        debug_wrapped = 0;
+        debug_daemon_state.overlapped = 0;
+        atomic_set(&debug_off_a, 0);
+        atomic_set(&debug_daemon_next_write, 0);
+        spin_unlock_irqrestore(&portals_debug_lock, irq_flags);
+
+        if (was_paused == 0)
+                atomic_set(&debug_daemon_state.paused, 0);
+
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/* Debug markers, although printed by S_PORTALS, should not be marked as
+ * such; DEBUG_SUBSYSTEM is switched to S_UNDEFINED for this function only
+ * and restored to S_PORTALS afterwards.
+ */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+/*
+ * Write a three-line marker containing "text" into the debug log.
+ * Returns 0 on success or -EINVAL if no debug buffer is allocated.
+ */
+int portals_debug_mark_buffer(char *text)
+{
+        if (debug_buf != NULL) {
+                CDEBUG(0, "*******************************************************************************\n");
+                CDEBUG(0, "DEBUG MARKER: %s\n", text);
+                CDEBUG(0, "*******************************************************************************\n");
+                return 0;
+        }
+
+        return -EINVAL;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+/*
+ * Copy the contents of the debug buffer to user space in chronological
+ * order.
+ *
+ * buf: user-space destination
+ * len: size of the destination; must be at least debug_size
+ *
+ * Returns the number of bytes made available to the caller (debug_size if
+ * the buffer has wrapped, otherwise the current offset), -ENOSPC if the
+ * destination is too small, or -EFAULT if a copy to user space failed.
+ *
+ * NOTE(review): copy_to_user() is still called with portals_debug_lock
+ * held and interrupts disabled, as in the original code; it can fault on
+ * the user page.  Fixing that needs a bounce buffer or a different locking
+ * scheme and is left for a separate change.
+ */
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        int rc;
+        unsigned long debug_off;
+        unsigned long flags;
+        unsigned long uncopied;
+
+        if (len < debug_size)
+                return -ENOSPC;
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        /* sample the offset under the lock so that it is consistent with
+         * debug_wrapped */
+        debug_off = atomic_read(&debug_off_a);
+        if (debug_wrapped) {
+                /* All of this juggling with the 1s is to keep the trailing nul
+                 * (which falls at debug_buf + debug_off) at the end of what we
+                 * copy into user space */
+                uncopied = copy_to_user(buf, debug_buf + debug_off + 1,
+                                        debug_size - debug_off - 1);
+                if (uncopied == 0)
+                        uncopied = copy_to_user(buf + debug_size -
+                                                debug_off - 1,
+                                                debug_buf, debug_off + 1);
+                rc = debug_size;
+        } else {
+                uncopied = copy_to_user(buf, debug_buf, debug_off);
+                rc = debug_off;
+        }
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        /* copy_to_user() returns the number of bytes it could NOT copy */
+        if (uncopied != 0)
+                return -EFAULT;
+
+        return rc;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+/*
+ * Format one debug message into the debug buffer at the current offset.
+ *
+ * subsys: subsystem identifier; its top byte (subsys >> 24) is printed
+ * mask:   debug mask; printed in the prefix and tested against D_EMERG and
+ *         D_ERROR to decide whether the message also goes to the console
+ * file, fn, line: source location of the caller
+ * stack:  stack usage value printed after the pid
+ * format: printf-style format followed by its arguments
+ *
+ * The whole operation runs under portals_debug_lock with interrupts
+ * disabled.  While the debug daemon is not paused, messages are dropped
+ * once fewer than DAEMON_SND_SIZE bytes separate the write offset from the
+ * daemon's next-write position; the first dropped message is replaced by
+ * an "overlapped" marker.
+ */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   unsigned long stack, const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        int           max_nob;
+        int           prefix_nob;
+        int           msg_nob;
+        struct timeval tv;
+        unsigned long base_offset;
+        unsigned long debug_off;
+
+        if (debug_buf == NULL) {
+                printk("portals_debug_msg: debug_buf is NULL!\n");
+                return;
+        }
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        debug_off = atomic_read(&debug_off_a);
+        if (!atomic_read(&debug_daemon_state.paused)) {
+                unsigned long available;
+                long delta;
+                long v = atomic_read(&debug_daemon_next_write);
+
+                /* distance between our write offset and the daemon's
+                 * position, taking wrap-around into account */
+                delta = debug_off - v;
+                available = (delta>=0) ? debug_size-delta : -delta;
+                // Check if we still have enough debug buffer for CDEBUG
+                if (available < DAEMON_SND_SIZE) {
+                        /* Drop CDEBUG packets until enough debug_buffer is
+                         * available */
+                        if (debug_daemon_state.overlapped)
+                                 goto out;
+                        /* If this is the first time, leave a marker in the
+                         * output */
+                        debug_daemon_state.overlapped = 1;
+                        /* NOTE(review): assigning NULL to a va_list is not
+                         * portable; ap is re-initialised by va_start() below
+                         * and the marker format has no conversions */
+                        ap = NULL;
+                        format = "DEBUG MARKER: Debug buffer overlapped\n";
+                } else  /* More space just became available */
+                        debug_daemon_state.overlapped = 0;
+        }
+
+        /* the message may run past debug_size into the DEBUG_OVERFLOW area;
+         * the excess is copied back to the start of the buffer below */
+        max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
+        if (max_nob <= 0) {
+                spin_unlock_irqrestore(&portals_debug_lock, flags);
+                printk("logic error in portals_debug_msg: <0 bytes to write\n");
+                return;
+        }
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        do_gettimeofday(&tv);
+
+        /* prefix: subsystem byte, mask, CPU number and timestamp */
+        prefix_nob = snprintf(debug_buf + debug_off, max_nob,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id(),
+                              tv.tv_sec, tv.tv_usec);
+        max_nob -= prefix_nob;
+
+        /* source location and pid; the UML variants also print extern_pid */
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.extern_pid, stack);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.mode.tt.extern_pid, stack);
+#else
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d+%lu): ",
+                           file, line, fn, current->pid, stack);
+#endif
+        max_nob -= msg_nob;
+
+        va_start(ap, format);
+        msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob,
+                            max_nob, format, ap);
+        /* NOTE(review): msg_nob now includes the location string that was
+         * already subtracted above, so max_nob is reduced twice by it;
+         * max_nob is not read again after this point */
+        max_nob -= msg_nob;
+        va_end(ap);
+
+        /* Print to console, while msg is contiguous in debug_buf */
+        /* NB safely terminated see above */
+        /* NOTE(review): the D_EMERG test is not chained with "else", so a
+         * D_EMERG message can be printed a second time by one of the
+         * branches below -- confirm whether that is intended */
+        if ((mask & D_EMERG) != 0)
+                printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+        if ((mask & D_ERROR) != 0)
+                printk(KERN_ERR   "%s", debug_buf + debug_off + prefix_nob);
+        else if (portal_printk)
+                printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+        /* offset within the current 64K window, used for the wake-up test */
+        base_offset = debug_off & 0xFFFF;
+
+        debug_off += prefix_nob + msg_nob;
+        if (debug_off > debug_size) {
+                /* wrapped: move the part written into the overflow area
+                 * (plus its terminating nul) to the start of the buffer */
+                memcpy(debug_buf, debug_buf + debug_size,
+                       debug_off - debug_size + 1);
+                debug_off -= debug_size;
+                debug_wrapped = 1;
+        }
+
+        atomic_set(&debug_off_a, debug_off);
+        /* wake the daemon when this message reached or crossed
+         * DAEMON_SND_SIZE within the window computed above */
+        if (!atomic_read(&debug_daemon_state.paused) &&
+            ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) {
+                debug_daemon_state.daemon_event = 1;
+                wake_up(&debug_daemon_state.daemon);
+        }
+out:
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+}
+
+/* Log the new debug level to the console and store it in portal_debug. */
+void portals_debug_set_level(unsigned int debug_level)
+{
+        printk("Setting portals debug level to %08x\n", debug_level);
+        portal_debug = debug_level;
+}
+
+/*
+ * Run the user-space upcall program named by portals_upcall with the
+ * arguments "LBUG", file, fn and the line number, and log whether the
+ * invocation succeeded.
+ */
+void portals_run_lbug_upcall(char * file, char *fn, int line)
+{
+        char  line_str[32];
+        char *args[6];
+        char *env[3];
+        int   status;
+
+        ENTRY;
+        snprintf(line_str, sizeof(line_str), "%d", line);
+
+        env[0] = "HOME=/";
+        env[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        env[2] = NULL;
+
+        args[0] = portals_upcall;
+        args[1] = "LBUG";
+        args[2] = file;
+        args[3] = fn;
+        args[4] = line_str;
+        args[5] = NULL;
+
+        status = call_usermodehelper(args[0], args, env);
+        if (status >= 0) {
+                CERROR("Invoked upcall %s %s %s %s %s\n",
+                       args[0], args[1], args[2], args[3], args[4]);
+                return;
+        }
+
+        CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
+               "/proc/sys/portals/upcall\n",
+               args[0], args[1], args[2], args[3], args[4], status);
+}
+
+
+/* Debug entry points exported to other kernel modules. */
+EXPORT_SYMBOL(portals_debug_dumplog);
+EXPORT_SYMBOL(portals_debug_msg);
+EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_lbug_upcall);
diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c
new file mode 100644 (file)
index 0000000..1b9e5bb
--- /dev/null
@@ -0,0 +1,572 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+#include <portals/lib-p30.h>
+#include <portals/p30.h>
+#include <linux/kp30.h>
+
+/* minor number used for the "portals" misc device (see portal_dev) */
+#define PORTAL_MINOR 240
+
+/* ping client entry point; looked up at run time via PORTAL_SYMBOL_GET */
+extern void (kping_client)(struct portal_ioctl_data *);
+
+/* A registered NAL command handler and the private argument passed to it. */
+struct nal_cmd_handler {
+        nal_cmd_handler_t nch_handler;
+        void * nch_private;
+};
+
+/* Handler table indexed by NAL id; ids 1..NAL_MAX_NR are accepted by the
+ * register/unregister/command functions below. */
+static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1];
+/* held around every access to nal_cmd[] */
+struct semaphore nal_cmd_sem;
+
+#ifdef PORTAL_DEBUG
+/*
+ * Report a failed assertion: log "ASSERTION(expr) failed" at D_EMERG with
+ * the caller's file, function and line, then call LBUG().
+ * Only built when PORTAL_DEBUG is defined.
+ */
+void
+kportal_assertion_failed (char *expr, char *file, char *func, int line)
+{
+        /* NOTE(review): the macro is handed the variable it initialises;
+         * presumably it only uses its address -- confirm in kp30.h */
+        unsigned long stack = CDEBUG_STACK(stack);
+        portals_debug_msg(0, D_EMERG, file, func, line, stack,
+                          "ASSERTION(%s) failed\n", expr);
+        LBUG();
+}
+#endif
+
+/*
+ * Daemonize the current kernel thread and name it "str".  Kernels before
+ * 2.5.63 have a daemonize() without a name argument, so the name is
+ * written into current->comm by hand there.
+ */
+void
+kportal_daemonize (char *str)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,63))
+        daemonize();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#else
+        daemonize(str);
+#endif
+}
+
+/*
+ * Block every signal for the current task: the blocked set is initialised
+ * to the inverse of an empty mask and the pending state is recalculated,
+ * all under the task's sigmask_lock.
+ */
+void
+kportal_blockallsigs (void)
+{
+        unsigned long  flags;
+
+        spin_lock_irqsave (&current->sigmask_lock, flags);
+        siginitsetinv (&current->blocked, 0);
+        recalc_sigpending (current);
+        spin_unlock_irqrestore (&current->sigmask_lock, flags);
+}
+
+/* open() handler for the portals device: takes a module use reference.
+ * Returns -EINVAL when no inode is supplied, 0 otherwise. */
+static int kportal_psdev_open(struct inode * inode, struct file * file)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        if (inode != NULL) {
+                PORTAL_MODULE_USE;
+                rc = 0;
+        }
+        RETURN(rc);
+}
+
+/* release() handler for the portals device: drops the module use
+ * reference.  Returns -EINVAL when no inode is supplied, 0 otherwise. */
+static int kportal_psdev_release(struct inode * inode, struct file * file)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        if (inode != NULL) {
+                PORTAL_MODULE_UNUSE;
+                rc = 0;
+        }
+        RETURN(rc);
+}
+
+/* Release a buffer of "len" bytes through PORTAL_FREE. */
+static inline void freedata(void *data, int len)
+{
+        PORTAL_FREE(data, len);
+}
+
+/*
+ * Add a route for NIDs lo_nid..hi_nid via the given gateway by calling the
+ * router's control interface.
+ * Returns -ENODEV if the control interface is unavailable, otherwise the
+ * result of kprci_add_route().
+ */
+static int
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+                  ptl_nid_t hi_nid)
+{
+        kpr_control_interface_t *ctl;
+        int                      result = -ENODEV;
+
+        ctl = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface);
+        if (ctl != NULL) {
+                result = ctl->kprci_add_route (gateway_nalid, gateway_nid,
+                                               lo_nid, hi_nid);
+                PORTAL_SYMBOL_PUT(kpr_control_interface);
+        }
+
+        return (result);
+}
+
+/*
+ * Delete the route to "target" by calling the router's control interface.
+ * Returns -ENODEV if the control interface is unavailable, otherwise the
+ * result of kprci_del_route().
+ */
+static int
+kportal_del_route(ptl_nid_t target)
+{
+        kpr_control_interface_t *ctl;
+        int                      result = -ENODEV;
+
+        ctl = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ctl != NULL) {
+                result = ctl->kprci_del_route (target);
+                PORTAL_SYMBOL_PUT(kpr_control_interface);
+        }
+
+        return (result);
+}
+
+/*
+ * Look up routing table entry "index" through the router's control
+ * interface.
+ *
+ * On success (return 0) the gateway NAL id, gateway NID and the low/high
+ * NIDs of the routed range are stored through the four output pointers;
+ * on failure the outputs are left untouched.
+ *
+ * Returns -ENODEV if the control interface is unavailable, otherwise the
+ * result of kprci_get_route().
+ */
+static int
+kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
+                  ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+{
+        int       gateway_nalid;
+        ptl_nid_t gateway_nid;
+        ptl_nid_t lo_nid;
+        ptl_nid_t hi_nid;
+        int       rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
+                                 &hi_nid);
+
+        if (rc == 0) {
+                CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
+                       index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+                *gateway_nalidp = (__u32)gateway_nalid;
+                /* NIDs are 64-bit (printed with LPX64 above); store them
+                 * whole rather than truncating them through __u32 */
+                *gateway_nidp   = gateway_nid;
+                *lo_nidp        = lo_nid;
+                *hi_nidp        = hi_nid;
+        }
+
+        PORTAL_SYMBOL_PUT (kpr_control_interface);
+        return (rc);
+}
+
+/*
+ * Pass an ioctl request to the command handler registered for "nal".
+ * Returns the handler's result, or -EINVAL if the NAL id is out of range
+ * or has no handler registered.
+ */
+static int
+kportal_nal_cmd(int nal, struct portal_ioctl_data *data)
+{
+        struct nal_cmd_handler *slot;
+        int                     rc = -EINVAL;
+
+        ENTRY;
+
+        down(&nal_cmd_sem);
+        if (nal > 0 && nal <= NAL_MAX_NR) {
+                slot = &nal_cmd[nal];
+                if (slot->nch_handler) {
+                        CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd);
+                        rc = slot->nch_handler(data, slot->nch_private);
+                }
+        }
+        up(&nal_cmd_sem);
+
+        RETURN(rc);
+}
+
+/*
+ * Return the network interface handle exported by the kernel NAL with the
+ * given id, obtained through PORTAL_SYMBOL_GET.  Returns NULL for TCPNAL
+ * (a userspace NAL) and for unknown ids; the latter is also logged.
+ */
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+        ptl_handle_ni_t *ni = NULL;
+
+        switch (nal) {
+        case QSWNAL:
+                ni = PORTAL_SYMBOL_GET(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                ni = PORTAL_SYMBOL_GET(ksocknal_ni);
+                break;
+        case TOENAL:
+                ni = PORTAL_SYMBOL_GET(ktoenal_ni);
+                break;
+        case GMNAL:
+                ni = PORTAL_SYMBOL_GET(kgmnal_ni);
+                break;
+        case SCIMACNAL:
+                ni = PORTAL_SYMBOL_GET(kscimacnal_ni);
+                break;
+        case TCPNAL:
+                /* userspace NAL */
+                break;
+        default:
+                /* A warning to a naive caller */
+                CERROR ("unknown nal: %d\n", nal);
+                break;
+        }
+
+        return (ni);
+}
+
+/*
+ * Release the reference on a NAL's network interface symbol taken by
+ * kportal_get_ni().  TCPNAL has no kernel symbol (kportal_get_ni() returns
+ * NULL for it), so being asked to release it is treated as a bug.
+ */
+void
+kportal_put_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                PORTAL_SYMBOL_PUT(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                PORTAL_SYMBOL_PUT(ksocknal_ni);
+                break;
+        case TOENAL:
+                PORTAL_SYMBOL_PUT(ktoenal_ni);
+                break;
+        case GMNAL:
+                PORTAL_SYMBOL_PUT(kgmnal_ni);
+                break;
+        case TCPNAL:
+                /* A lesson to a malicious caller */
+                LBUG ();
+                /* do not fall through and drop a kscimacnal_ni reference
+                 * that was never taken, should LBUG() ever return */
+                break;
+        case SCIMACNAL:
+                PORTAL_SYMBOL_PUT(kscimacnal_ni);
+                break;
+        default:
+                CERROR ("unknown nal: %d\n", nal);
+                break;
+        }
+}
+
+/*
+ * Register a command handler for a NAL.
+ *
+ * nal:     NAL id, 1..NAL_MAX_NR
+ * handler: function invoked by kportal_nal_cmd() for this NAL
+ * private: opaque argument passed back to the handler
+ *
+ * Returns 0 on success, -EBUSY if a handler is already registered for the
+ * NAL, or -EINVAL if the NAL id is out of range.
+ */
+int
+kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
+
+        /* an out-of-range id used to be reported as success although
+         * nothing was registered */
+        if (nal <= 0 || nal > NAL_MAX_NR)
+                return -EINVAL;
+
+        down(&nal_cmd_sem);
+        if (nal_cmd[nal].nch_handler != NULL)
+                rc = -EBUSY;
+        else {
+                nal_cmd[nal].nch_handler = handler;
+                nal_cmd[nal].nch_private = private;
+        }
+        up(&nal_cmd_sem);
+
+        return rc;
+}
+
+/*
+ * Remove the command handler registered for a NAL.  Out-of-range ids are
+ * ignored.  Always returns 0.
+ */
+int
+kportal_nal_unregister(int nal)
+{
+        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
+
+        if (nal <= 0 || nal > NAL_MAX_NR)
+                return 0;
+
+        down(&nal_cmd_sem);
+        nal_cmd[nal].nch_private = NULL;
+        nal_cmd[nal].nch_handler = NULL;
+        up(&nal_cmd_sem);
+
+        return 0;
+}
+
+
+/*
+ * ioctl() handler for the portals device.
+ *
+ * cmd: must have type IOC_PORTAL_TYPE and a number within
+ *      IOC_PORTAL_MIN_NR..IOC_PORTAL_MAX_NR
+ * arg: user-space address of a struct portal_ioctl_data; it is copied into
+ *      a local buffer by portal_ioctl_getdata() and, for commands that
+ *      return data, copied back to the same address
+ *
+ * Returns 0 or a command-specific result on success, -EINVAL for an
+ * invalid command or argument, -EPERM when IOC_PORTAL_PANIC is requested
+ * without CAP_SYS_BOOT, and -EFAULT when results cannot be copied back to
+ * user space.
+ */
+static int kportal_ioctl(struct inode *inode, struct file *file,
+                         unsigned int cmd, unsigned long arg)
+{
+        int err = 0;
+        char buf[1024];
+        struct portal_ioctl_data *data;
+
+        ENTRY;
+
+        if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE ||
+             _IOC_NR(cmd) < IOC_PORTAL_MIN_NR  ||
+             _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) {
+                CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                                _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+                RETURN(-EINVAL);
+        }
+
+        if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+                CERROR("PORTALS ioctl: data error\n");
+                RETURN(-EINVAL);
+        }
+
+        data = (struct portal_ioctl_data *)buf;
+
+        switch (cmd) {
+        case IOC_PORTAL_SET_DAEMON:
+                RETURN (portals_debug_set_daemon (
+                                        (unsigned int) data->ioc_count,
+                                        (unsigned int) data->ioc_inllen1,
+                                        (char *) data->ioc_inlbuf1,
+                                        (unsigned int) data->ioc_misc));
+        case IOC_PORTAL_GET_DEBUG: {
+                __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1,
+                                                        data->ioc_plen1);
+
+                if (size < 0)
+                        RETURN(size);
+
+                data->ioc_size = size;
+                /* copy_to_user() returns the number of bytes it could not
+                 * copy, which is not an errno; report -EFAULT instead */
+                if (copy_to_user((char *)arg, data, sizeof(*data)))
+                        RETURN(-EFAULT);
+                RETURN(0);
+        }
+        case IOC_PORTAL_CLEAR_DEBUG:
+                portals_debug_clear_buffer();
+                RETURN(0);
+        case IOC_PORTAL_PANIC:
+                if (!capable (CAP_SYS_BOOT))
+                        RETURN (-EPERM);
+                panic("debugctl-invoked panic");
+                RETURN(0);
+        case IOC_PORTAL_MARK_DEBUG:
+                /* the text must be present, non-empty and NUL terminated;
+                 * the length check keeps the [len - 1] index in bounds */
+                if (data->ioc_inlbuf1 == NULL ||
+                    data->ioc_inllen1 == 0 ||
+                    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                        RETURN(-EINVAL);
+                portals_debug_mark_buffer(data->ioc_inlbuf1);
+                RETURN(0);
+        case IOC_PORTAL_PING: {
+                void (*ping)(struct portal_ioctl_data *);
+
+                CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n",
+                       data->ioc_count, data->ioc_nid);
+                ping = PORTAL_SYMBOL_GET(kping_client);
+                if (!ping)
+                        CERROR("PORTAL_SYMBOL_GET failed\n");
+                else {
+                        ping(data);
+                        PORTAL_SYMBOL_PUT(kping_client);
+                }
+                RETURN(0);
+        }
+
+        case IOC_PORTAL_ADD_ROUTE:
+                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                       data->ioc_nal, data->ioc_nid, data->ioc_nid2,
+                       data->ioc_nid3);
+                /* the range bounds may be given in either order */
+                err = kportal_add_route(data->ioc_nal, data->ioc_nid,
+                                        MIN (data->ioc_nid2, data->ioc_nid3),
+                                        MAX (data->ioc_nid2, data->ioc_nid3));
+                break;
+
+        case IOC_PORTAL_DEL_ROUTE:
+                CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
+                err = kportal_del_route (data->ioc_nid);
+                break;
+
+        case IOC_PORTAL_GET_ROUTE:
+                CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
+                err = kportal_get_route(data->ioc_count, &data->ioc_nal,
+                                        &data->ioc_nid, &data->ioc_nid2,
+                                        &data->ioc_nid3);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_GET_NID: {
+                const ptl_handle_ni_t *nip;
+                ptl_process_id_t       pid;
+
+                CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlGetId (*nip, &pid);
+                LASSERT (err == PTL_OK);
+                kportal_put_ni (data->ioc_nal);
+
+                data->ioc_nid = pid.nid;
+                if (copy_to_user ((char *)arg, data, sizeof (*data)))
+                        err = -EFAULT;
+                break;
+        }
+
+        case IOC_PORTAL_NAL_CMD:
+                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal,
+                        data->ioc_nal_cmd);
+                err = kportal_nal_cmd(data->ioc_nal, data);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_FAIL_NID: {
+                const ptl_handle_ni_t *nip;
+
+                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
+                        data->ioc_nal, data->ioc_nid, data->ioc_count);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count);
+                /* drop the reference taken by kportal_get_ni() above */
+                kportal_put_ni (data->ioc_nal);
+                break;
+        }
+
+        default:
+                err = -EINVAL;
+                break;
+        }
+
+        RETURN(err);
+}
+
+
+/* File operations for the portals control device. */
+static struct file_operations portalsdev_fops = {
+        .ioctl   = kportal_ioctl,
+        .open    = kportal_psdev_open,
+        .release = kportal_psdev_release
+};
+
+
+/* Misc device registered by init_kportals_module(); the positional
+ * initialisers supply PORTAL_MINOR, the name "portals" and the file
+ * operations table. */
+static struct miscdevice portal_dev = {
+        PORTAL_MINOR,
+        "portals",
+        &portalsdev_fops
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module initialisation: set up the 5MB debug buffer, the NAL command
+ * semaphore, the misc device, the Portals library and the /proc entries,
+ * in that order.  On failure everything set up so far is torn down in
+ * reverse order and the error is returned.
+ */
+static int init_kportals_module(void)
+{
+        int err;
+
+        err = portals_debug_init(5 * 1024 * 1024);
+        if (err < 0) {
+                printk(KERN_ERR "portals_debug_init: %d\n", err);
+                return err;
+        }
+
+        sema_init(&nal_cmd_sem, 1);
+
+        err = misc_register(&portal_dev);
+        if (err != 0) {
+                CERROR("misc_register: error %d\n", err);
+                goto undo_debug;
+        }
+
+        err = PtlInit();
+        if (err != 0) {
+                CERROR("PtlInit: error %d\n", err);
+                goto undo_misc;
+        }
+
+        err = insert_proc();
+        if (err != 0) {
+                CERROR("insert_proc: error %d\n", err);
+                goto undo_ptl;
+        }
+
+        CDEBUG (D_OTHER, "portals setup OK\n");
+        return 0;
+
+ undo_ptl:
+        PtlFini();
+ undo_misc:
+        misc_deregister(&portal_dev);
+ undo_debug:
+        portals_debug_cleanup();
+        return err;
+}
+
+/*
+ * Module teardown: remove the /proc entries, shut down the Portals
+ * library, deregister the misc device, report any memory still accounted
+ * in portal_kmemory and finally release the debug buffer.
+ */
+static void exit_kportals_module(void)
+{
+        int status;
+
+        remove_proc();
+        PtlFini();
+
+        CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+        status = misc_deregister(&portal_dev);
+        if (status != 0)
+                CERROR("misc_deregister error %d\n", status);
+
+        if (atomic_read(&portal_kmemory) != 0) {
+                CERROR("Portals memory leaked: %d bytes\n",
+                       atomic_read(&portal_kmemory));
+        }
+
+        /* last, because CERROR/CDEBUG above still write to the buffer */
+        status = portals_debug_cleanup();
+        if (status != 0)
+                printk(KERN_ERR "portals_debug_cleanup: %d\n", status);
+}
+
+/* Symbols exported to the NALs and other Portals users. */
+EXPORT_SYMBOL(lib_dispatch);
+EXPORT_SYMBOL(PtlMEAttach);
+EXPORT_SYMBOL(PtlMEInsert);
+EXPORT_SYMBOL(PtlMEUnlink);
+EXPORT_SYMBOL(PtlEQAlloc);
+EXPORT_SYMBOL(PtlMDAttach);
+EXPORT_SYMBOL(PtlMDUnlink);
+EXPORT_SYMBOL(PtlNIInit);
+EXPORT_SYMBOL(PtlNIFini);
+EXPORT_SYMBOL(PtlNIDebug);
+EXPORT_SYMBOL(PtlInit);
+EXPORT_SYMBOL(PtlFini);
+EXPORT_SYMBOL(PtlPut);
+EXPORT_SYMBOL(PtlGet);
+EXPORT_SYMBOL(ptl_err_str);
+EXPORT_SYMBOL(portal_subsystem_debug);
+EXPORT_SYMBOL(portal_debug);
+EXPORT_SYMBOL(portal_stack);
+EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(PtlEQWait);
+EXPORT_SYMBOL(PtlEQFree);
+EXPORT_SYMBOL(PtlEQGet);
+EXPORT_SYMBOL(PtlGetId);
+EXPORT_SYMBOL(PtlMDBind);
+EXPORT_SYMBOL(lib_iov_nob);
+EXPORT_SYMBOL(lib_copy_iov2buf);
+EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_kiov_nob);
+EXPORT_SYMBOL(lib_copy_kiov2buf);
+EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_finalize);
+EXPORT_SYMBOL(lib_parse);
+EXPORT_SYMBOL(lib_init);
+EXPORT_SYMBOL(lib_fini);
+EXPORT_SYMBOL(portal_kmemory);
+EXPORT_SYMBOL(kportal_daemonize);
+EXPORT_SYMBOL(kportal_blockallsigs);
+EXPORT_SYMBOL(kportal_nal_register);
+EXPORT_SYMBOL(kportal_nal_unregister);
+/* kportal_assertion_failed() is only defined when PORTAL_DEBUG is set, so
+ * export it under the same condition */
+#ifdef PORTAL_DEBUG
+EXPORT_SYMBOL(kportal_assertion_failed);
+#endif
+EXPORT_SYMBOL(dispatch_name);
+EXPORT_SYMBOL(kportal_get_ni);
+EXPORT_SYMBOL(kportal_put_ni);
+
+module_init(init_kportals_module);
+module_exit (exit_kportals_module);
diff --git a/lnet/libcfs/proc.c b/lnet/libcfs/proc.c
new file mode 100644 (file)
index 0000000..2fa739a
--- /dev/null
@@ -0,0 +1,290 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+#include <asm/div64.h>
+
+static struct ctl_table_header *portals_table_header = NULL;
+extern char debug_file_path[1024];
+extern char debug_daemon_file_path[1024];
+extern char portals_upcall[1024];
+
+#define PSDEV_PORTALS  (0x100)
+#define PSDEV_DEBUG           1   /* control debugging */
+#define PSDEV_SUBSYSTEM_DEBUG 2   /* control debugging */
+#define PSDEV_PRINTK          3   /* force all errors to console */
+#define PSDEV_DEBUG_PATH      4   /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 5   /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL  6   /* User mode upcall script  */
+
+/*
+ * sysctl entries for the Portals debugging knobs.  insert_proc() registers
+ * top_table (whose single entry points at portals_table) with
+ * register_sysctl_table().
+ *
+ * NOTE(review): the initializers below are positional; the field order is
+ * presumably that of the kernel's struct ctl_table (ctl_name, procname,
+ * data, maxlen, mode, child, proc_handler, strategy) -- confirm against the
+ * headers of the kernel being built.
+ */
+#define PORTALS_PRIMARY_CTLCNT 6
+/* PORTALS_PRIMARY_CTLCNT real entries followed by a zeroed terminator */
+static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
+        {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
+         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
+        {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
+         sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
+         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {0}
+};
+
+/* Top-level "portals" directory entry; its child is portals_table. */
+static struct ctl_table top_table[2] = {
+        {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table},
+        {0}
+};
+
+
+#ifdef PORTALS_PROFILING
+/*
+ * Profiling table.  We do this statically for now because it's simple,
+ * but we could do some tricks with ELF sections to have this array
+ * automatically built.
+ *
+ * def_prof(FOO) initializes the slot indexed by PROF__FOO with the name
+ * string "FOO" and a zero first counter.  insert_proc() refuses to register
+ * the /proc entry unless ARRAY_SIZE(prof_ents) == MAX_PROFS, so every new
+ * PROF__ value needs a matching def_prof() line here.
+ */
+#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }
+
+struct prof_ent prof_ents[] = {
+        def_prof(our_recvmsg),
+        def_prof(our_sendmsg),
+        def_prof(socknal_recv),
+        def_prof(lib_parse),
+        def_prof(conn_list_walk),
+        def_prof(memcpy),
+        def_prof(lib_finalize),
+        def_prof(pingcli_time),
+        def_prof(gmnal_send),
+        def_prof(gmnal_recv),
+};
+
+EXPORT_SYMBOL(prof_ents);
+
+/*
+ * read_proc handler for the profiling "cycles" file (installed by
+ * insert_proc()).  This function is as crazy as the proc filling api
+ * requires.
+ *
+ * buffer: page allocated for us to scribble in.  the
+ *  data returned to the user will be taken from here.
+ * *start: address of the pointer that will tell the
+ *  caller where in buffer the data the user wants is.
+ * ppos: offset in the entire /proc file that the user
+ *  currently wants.
+ * wanted: the amount of data the user wants.
+ * eof: set to 1 once every prof_ents[] slot has been walked.
+ * data: unused.
+ *
+ * Returns the number of bytes available at *start, at most 'wanted'.
+ *
+ * while going, 'curpos' is the offset in the entire
+ * file where we currently are.  We only actually
+ * start filling buffer when we get to a place in
+ * the file that the user cares about, so that we only
+ * sprintf the lines the user asked for.
+ *
+ * The offset arithmetic assumes fixed size lines of 'line_len' bytes
+ * (including the trailing newline).  That holds as long as a name fits
+ * in 15 characters and each counter fits in 12 digits; the format
+ * widths are minimums, not maximums.
+ *
+ * The output is not a snapshot: the counters can change between the
+ * calls that together fill one read of the file, so the lines of one
+ * read may be mutually inconsistent.  oh well.
+ */
+
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        /* minimum width of every piece of 'format', plus 1 for the '\n'
+         * appended after each entry */
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1 + 1);
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                /* only the part of the header past ppos is left to copy;
+                 * copying header_len bytes from header + ppos would read
+                 * beyond the end of the string */
+                int diff = MIN(header_len - (int)ppos, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for ( i = 0; i < MAX_PROFS ; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len))  {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                /* average cycles per completed interval; avoid dividing
+                 * by zero for counters that never finished */
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else
+                {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness: eof is only reported by a call that walks off the
+         * end of the table */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+/*
+ * Register the Portals /proc and sysctl entries.
+ *
+ * With PORTALS_PROFILING: creates the basedir directory and a "cycles"
+ * file under it whose reads are served by prof_read_proc().
+ * With CONFIG_SYSCTL: registers top_table once and remembers the header
+ * in portals_table_header so remove_proc() can unregister it.
+ *
+ * Returns 0 on success, -1 if the profiling table is out of sync with
+ * MAX_PROFS or the "cycles" entry cannot be created.
+ */
+int insert_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /*
+         * This is pretty lame.  assuming that failure just
+         * means that they already existed.
+         */
+        /* dir is an automatic array: it must be made an empty string
+         * before anything is appended to it with strcat() */
+        dir[0] = '\0';
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        /* NOTE(review): a NULL return from register_sysctl_table() is not
+         * reported to the caller; the function still returns 0. */
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+/*
+ * Undo insert_proc(): remove the profiling "cycles" file and its
+ * directory (PORTALS_PROFILING), then unregister the sysctl table
+ * (CONFIG_SYSCTL).
+ */
+void remove_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char path[128];
+        int base_len;
+
+        /* build "<basedir>/cycles", remembering where basedir ends */
+        path[0] = '\0';
+        strcat(path, basedir);
+        base_len = strlen(path);
+        strcat(path, "/cycles");
+
+        /* the file goes first ... */
+        remove_proc_entry(path, 0);
+
+        /* ... then the directory, by chopping "/cycles" off again */
+        path[base_len] = '\0';
+        remove_proc_entry(path, 0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header != NULL) {
+                unregister_sysctl_table(portals_table_header);
+                portals_table_header = NULL;
+        }
+#endif
+}
diff --git a/lnet/lnet/Makefile.am b/lnet/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..9fb7f6f
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
+lib_LIBRARIES= libportals.a
+libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-md.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-not-impl.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
diff --git a/lnet/lnet/Makefile.mk b/lnet/lnet/Makefile.mk
new file mode 100644 (file)
index 0000000..5627ef7
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs    := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c
new file mode 100644 (file)
index 0000000..57427f6
--- /dev/null
@@ -0,0 +1,161 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * PtlMDUpdate is here so that it can access the per-eventq
+ * structures.
+ */
+
+#include <portals/api-support.h>
+
+/* Global event-queue setup hook, called from PtlInit().  Currently a
+ * no-op that always reports PTL_OK. */
+int ptl_eq_init(void)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+/* Global event-queue teardown hook, called from PtlFini().  No-op. */
+void ptl_eq_fini(void)
+{
+        /* Nothing to do anymore... */
+}
+
+/* Per-interface event-queue setup hook, called from PtlNIInit() for a
+ * newly registered NAL.  'nal' is unused; always returns PTL_OK. */
+int ptl_eq_ni_init(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+/* Per-interface event-queue teardown hook, called from PtlNIFini() when
+ * the last reference to 'nal' is dropped.  No-op. */
+void ptl_eq_ni_fini(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+}
+
+/*
+ * Non-blocking fetch of the next event from an event queue.
+ *
+ * eventq: handle of the queue to read.
+ * ev:     filled with a copy of the event when one is available.
+ *
+ * Returns PTL_NOINIT if the library is not initialized, PTL_INV_EQ if the
+ * handle does not map to an interface, PTL_EQ_EMPTY if the next slot holds
+ * no new event, PTL_EQ_DROPPED if an event was returned but its sequence
+ * number shows that earlier ones were overwritten, and PTL_OK otherwise.
+ */
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        /* NOTE(review): eq is dereferenced below without a NULL check;
+         * presumably ptl_handle2usereq() cannot fail for a handle whose
+         * interface lookup succeeded -- confirm. */
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        /* the slot for the sequence number we expect next */
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        /* the slot still holds an older event: nothing new has arrived */
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+        
+        /* ensure event is delivered correctly despite possible
+           races with lib_finalize: a sequence number other than the one
+           we expected means events were lost in between */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        /* resynchronize on the event actually returned */
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+/*
+ * Blocking variant of PtlEQGet(): poll the queue until it yields
+ * anything other than PTL_EQ_EMPTY, giving the NAL a chance to yield
+ * between polls.  Returns whatever PtlEQGet() finally returned.
+ */
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        int rc;
+
+        for (;;) {
+                nal_t *nal;
+
+                /* PtlEQGet does the handle checking */
+                rc = PtlEQGet(eventq_in, event_out);
+                if (rc != PTL_EQ_EMPTY)
+                        break;
+
+                /* PTL_EQ_EMPTY is only returned for a handle that mapped
+                 * to an interface, so the lookup is repeated unchecked */
+                nal = ptl_hndl2nal(&eventq_in);
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+
+        return rc;
+}
+
+#ifndef __KERNEL__
+/* Jump target armed by PtlEQWait_timeout() before it starts waiting. */
+static jmp_buf eq_jumpbuf;
+
+/*
+ * SIGALRM handler installed by PtlEQWait_timeout(): abandons the wait by
+ * jumping back to the setjmp() in that function.
+ * NOTE(review): this leaves a signal handler with plain longjmp(); whether
+ * the signal mask is restored depends on the platform's setjmp() -- confirm,
+ * or consider sigsetjmp()/siglongjmp().
+ */
+static void eq_timeout(int signal)
+{
+        longjmp(eq_jumpbuf, -1);
+}
+
+/*
+ * PtlEQWait() bounded by a SIGALRM based timeout.
+ *
+ * eventq_in: handle of the queue to wait on.
+ * event_out: filled with the event when one arrives in time.
+ * timeout:   maximum time to wait, in seconds.
+ *
+ * Returns PTL_EQ_EMPTY if the alarm fired before an event arrived,
+ * otherwise the result of PtlEQWait().
+ *
+ * The caller's SIGALRM handler and any alarm it had pending are saved
+ * and put back on exit.  The saved state lives in statics (it has to
+ * survive the longjmp), so this function is not reentrant.
+ */
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        static void (*prev) (int);
+        static int left_over;
+        int rc;
+
+        if (setjmp(eq_jumpbuf)) {
+                /* eq_timeout() fired.  Restore the caller's handler and
+                 * re-arm only what is left of the caller's own alarm;
+                 * never pass a negative value to alarm(). */
+                signal(SIGALRM, prev);
+                alarm(left_over > timeout ? left_over - timeout : 0);
+                return PTL_EQ_EMPTY;
+        }
+
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        /* alarm() returns 0 when no alarm was pending; in that case our
+         * timeout must stay armed.  Only shorten it when the caller had
+         * an earlier alarm of its own. */
+        if (left_over != 0 && left_over < timeout)
+                alarm(left_over);
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        signal(SIGALRM, prev);
+        alarm(left_over);       /* Should compute how long we waited */
+
+        return rc;
+}
+
+#endif
+
diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c
new file mode 100644 (file)
index 0000000..5cb0980
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/*
+ * Names of the Portals return codes, one string per code.
+ * If you change these, you must update the number table in portals/errno.h:
+ * the order of the entries here has to stay in step with it.
+ */
+const char *ptl_err_str[] = {
+        "PTL_OK",
+        "PTL_SEGV",
+
+        "PTL_NOSPACE",
+        "PTL_INUSE",
+        "PTL_VAL_FAILED",
+
+        "PTL_NAL_FAILED",
+        "PTL_NOINIT",
+        "PTL_INIT_DUP",
+        "PTL_INIT_INV",
+        "PTL_AC_INV_INDEX",
+
+        "PTL_INV_ASIZE",
+        "PTL_INV_HANDLE",
+        "PTL_INV_MD",
+        "PTL_INV_ME",
+        "PTL_INV_NI",
+/* If you change these, you must update the number table in portals/errno.h */
+        "PTL_ILL_MD",
+        "PTL_INV_PROC",
+        "PTL_INV_PSIZE",
+        "PTL_INV_PTINDEX",
+        "PTL_INV_REG",
+
+        "PTL_INV_SR_INDX",
+        "PTL_ML_TOOLONG",
+        "PTL_ADDR_UNKNOWN",
+        "PTL_INV_EQ",
+        "PTL_EQ_DROPPED",
+
+        "PTL_EQ_EMPTY",
+        "PTL_NOUPDATE",
+        "PTL_FAIL",
+        "PTL_NOT_IMPLEMENTED",
+        "PTL_NO_ACK",
+
+        "PTL_IOV_TOO_MANY",
+        "PTL_IOV_TOO_SMALL",
+
+        "PTL_EQ_INUSE",
+        "PTL_MD_INUSE"
+};
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c
new file mode 100644 (file)
index 0000000..b54f684
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-init.c
+ * Initialization and global data for the p30 user side library
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * All handles have their interface number stored in the second 16 bit word
+ */
+
+#include <portals/api-support.h>
+
+/* Non-zero between PtlInit() and PtlFini(); the API entry points return
+ * PTL_NOINIT while it is zero. */
+int ptl_init;
+/* Debug controls; presumably bit masks consumed by the CDEBUG/CERROR
+ * machinery -- confirm against kp30.h. */
+unsigned int portal_subsystem_debug = 0xfff7e3ff;
+unsigned int portal_debug = ~0;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+#ifdef __KERNEL__
+/* Kernel-only counter; presumably bytes currently allocated through the
+ * Portals allocation wrappers -- confirm against kp30.h. */
+atomic_t portal_kmemory = ATOMIC_INIT(0);
+#endif
+
+/* Set to 1 by PtlInit().  NOTE(review): nothing in this file clears it or
+ * touches the myr/ip variables below -- their users are elsewhere. */
+int __p30_initialized;
+int __p30_myr_initialized;
+int __p30_ip_initialized;
+ptl_handle_ni_t __myr_ni_handle;
+ptl_handle_ni_t __ip_ni_handle;
+
+int __p30_myr_timeout = 10;
+int __p30_ip_timeout;
+
+/*
+ * Initialize the API side of the library.  Calling it again once
+ * initialized is a no-op.  Always returns PTL_OK; the results of the
+ * sub-initializers are not examined.
+ */
+int PtlInit(void)
+{
+        if (!ptl_init) {
+                /* interfaces first, then match entries, then event
+                 * queues; PtlFini() unwinds in the opposite order */
+                ptl_ni_init();
+                ptl_me_init();
+                ptl_eq_init();
+
+                ptl_init = 1;
+                __p30_initialized = 1;
+        }
+
+        return PTL_OK;
+}
+
+
+/*
+ * Shut the API side of the library down and mark it uninitialized.
+ * NOTE(review): ptl_init is not checked first, and __p30_initialized
+ * (set by PtlInit()) is not cleared here -- confirm both are intended.
+ */
+void PtlFini(void)
+{
+
+        /* Reverse order of initialization */
+        ptl_eq_fini();
+        ptl_me_fini();
+        ptl_ni_fini();
+        ptl_init = 0;
+}
diff --git a/lnet/lnet/api-md.c b/lnet/lnet/api-md.c
new file mode 100644 (file)
index 0000000..967112f
--- /dev/null
@@ -0,0 +1,9 @@
+/*
+ * api-p30/md.c
+ *
+ * Memory descriptor functions that need address validation
+ * There are a few standing issues...
+ *  - Addresses are invalidated by the library without telling us.
+ */
+#include <portals/api-support.h>
+
diff --git a/lnet/lnet/api-me.c b/lnet/lnet/api-me.c
new file mode 100644 (file)
index 0000000..573e948
--- /dev/null
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-me.c
+ * Match Entry local operations.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Global match-entry setup hook, called from PtlInit().  Nothing to do;
+ * always returns PTL_OK. */
+int ptl_me_init(void)
+{
+        return PTL_OK;
+}
+/* Global match-entry teardown hook, called from PtlFini(). */
+void ptl_me_fini(void)
+{                                /* Nothing to do */
+}
+/* Per-interface match-entry setup hook, called from PtlNIInit() for a
+ * newly registered NAL.  'nal' is unused; always returns PTL_OK. */
+int ptl_me_ni_init(nal_t * nal)
+{
+        return PTL_OK;
+}
+
+/* Per-interface match-entry teardown hook, called from PtlNIFini() when
+ * the last reference to 'nal' is dropped. */
+void ptl_me_ni_fini(nal_t * nal)
+{                                /* Nothing to do... */
+}
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
new file mode 100644 (file)
index 0000000..952da4f
--- /dev/null
@@ -0,0 +1,184 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-ni.c
+ * Network Interface code
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+#define MAX_NIS 8
+static nal_t *ptl_interfaces[MAX_NIS];
+int ptl_num_interfaces = 0;
+
+/*
+ * Map a handle to the NAL registered in the interface slot the handle
+ * names.  Returns NULL when the slot index is out of range or the slot
+ * is empty.
+ */
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int slot = handle->nal_idx;
+
+        /* XXX no locking here: the caller must not race with interface
+         * setup/teardown, otherwise her NI handle could be invalidated
+         * underneath her, or even end up naming a completely different
+         * interface. */
+        return (slot < MAX_NIS) ? ptl_interfaces[slot] : NULL;
+}
+
+/*
+ * Reset the interface table to "no interfaces registered".  Called from
+ * PtlInit().  Always returns PTL_OK.
+ */
+int ptl_ni_init(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        /* the count has to be reset together with the table: after a
+         * PtlFini()/PtlInit() cycle a stale count would make PtlNIInit()
+         * skip empty slots and run out of space early */
+        ptl_num_interfaces = 0;
+
+        return PTL_OK;
+}
+
+/*
+ * Shut down every interface that is still registered, regardless of its
+ * reference count, and leave the table empty.  Called from PtlFini().
+ */
+void ptl_ni_fini(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++) {
+                nal_t *nal = ptl_interfaces[i];
+                if (!nal)
+                        continue;
+
+                if (nal->shutdown)
+                        nal->shutdown(nal, i);
+
+                /* as in PtlNIFini(): do not keep a pointer to an
+                 * interface that has been shut down */
+                ptl_interfaces[i] = NULL;
+        }
+
+        ptl_num_interfaces = 0;
+}
+
+/*
+ * Serialization of PtlNIInit()/PtlNIFini().  In the kernel build the
+ * helpers take and release ptl_ni_init_mutex; in the user-space build
+ * they are empty, i.e. there is no locking at all.
+ */
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_ni_init_mutex);
+
+/* Acquire ptl_ni_init_mutex with down(). */
+static void ptl_ni_init_mutex_enter (void) 
+{
+        down (&ptl_ni_init_mutex);
+}
+
+/* Release ptl_ni_init_mutex. */
+static void ptl_ni_init_mutex_exit (void)
+{
+        up (&ptl_ni_init_mutex);
+}
+
+#else
+/* User space: no-op. */
+static void ptl_ni_init_mutex_enter (void)
+{
+}
+
+/* User space: no-op. */
+static void ptl_ni_init_mutex_exit (void) 
+{
+}
+
+#endif
+
+/*
+ * Start a network interface, or take another reference on one that is
+ * already registered.
+ *
+ * interface:     NAL startup function; it is handed the slot index the
+ *                new interface would occupy plus the three values below.
+ * ptl_size, acl_size, requested_pid: passed through to 'interface'.
+ * handle:        on success, names the slot holding the interface.
+ *
+ * Returns PTL_NOINIT if PtlInit() has not been called, PTL_NAL_FAILED if
+ * 'interface' returns NULL, PTL_NOSPACE if a new interface was started
+ * but all MAX_NIS slots are in use (the interface is shut down again),
+ * and PTL_OK otherwise.
+ */
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int i;
+        int slot;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        /* PtlNIFini() can empty a slot in the middle of the table, so the
+         * number of registered interfaces is not necessarily the index of
+         * a free slot.  Use the first empty one; slot == MAX_NIS means
+         * the table is full. */
+        for (slot = 0; slot < MAX_NIS; slot++) {
+                if (ptl_interfaces[slot] == NULL)
+                        break;
+        }
+
+        nal = interface(slot, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        /* the NAL may already be registered; look at every slot, since
+         * registered interfaces need not be contiguous */
+        for (i = 0; i < MAX_NIS; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        handle->nal_idx = slot;
+        if (slot >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, slot);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        ptl_interfaces[slot] = nal;
+        ptl_num_interfaces++;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+/*
+ * Drop one reference on the interface named by 'ni'; the interface is
+ * shut down and its slot emptied when the last reference goes away.
+ *
+ * Returns PTL_NOINIT if the library is not initialized, PTL_INV_HANDLE
+ * if 'ni' names no registered interface, the result of the NAL's
+ * shutdown method when that method exists and is run, and PTL_OK
+ * otherwise.
+ */
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int rc = PTL_OK;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                rc = PTL_INV_HANDLE;
+                goto out;
+        }
+
+        /* other users remain: nothing more to do */
+        if (--nal->refct > 0)
+                goto out;
+
+        /* last reference: tear down in reverse order of PtlNIInit() */
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, ni.nal_idx);
+
+        ptl_interfaces[ni.nal_idx] = NULL;
+        ptl_num_interfaces--;
+
+out:
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+/*
+ * Return the network interface handle for an arbitrary handle.  The
+ * input handle is copied to *ni_out unchanged and without validation;
+ * always returns PTL_OK.
+ */
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
+{
+        *ni_out = handle_in;
+
+        return PTL_OK;
+}
diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c
new file mode 100644 (file)
index 0000000..cbd4d1f
--- /dev/null
@@ -0,0 +1,601 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Assumes the handle encodes the network number in the second 16 bit word
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/api-support.h>
+
+/*
+ * Common forwarding path used by every API wrapper in this file.
+ *
+ * Resolves 'any_h' to its NAL and passes the command number together with
+ * the argument and return blocks to that NAL's forward method.
+ *
+ * Returns PTL_NOINIT if the library is not initialised, PTL_INV_HANDLE if
+ * the handle does not resolve to a NAL, and PTL_OK otherwise.  The value
+ * returned by nal->forward() is not examined; callers take the status of
+ * the call itself from the 'rc' field of their return block.
+ */
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                /* this path is shared by all wrappers, not just PtlGetId */
+                fprintf(stderr, "do_forward: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
+
+/*
+ * Fetch the process id associated with interface 'ni_handle'.
+ * 'id' may be NULL, in which case the id is not copied out.
+ * Returns the forwarding error, or else the status in the reply block.
+ */
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in request;
+        PtlGetId_out reply;
+        int status;
+
+        request.handle_in = ni_handle;
+
+        status = do_forward(ni_handle, PTL_GETID, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_OK) {
+                if (id != NULL)
+                        *id = reply.id_out;
+                status = reply.rc;
+        }
+
+        return status;
+}
+
+/*
+ * Forward a PTL_FAILNID request carrying 'nid' and 'threshold' to the
+ * interface.  Returns the forwarding error, or else the reply status.
+ */
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold)
+{
+        PtlFailNid_in  request;
+        PtlFailNid_out reply;
+        int            status;
+
+        request.interface = interface;
+        request.nid       = nid;
+        request.threshold = threshold;
+
+        status = do_forward (interface, PTL_FAILNID,
+                             &request, sizeof(request), &reply, sizeof (reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Read status register 'register_in' of the interface.
+ * 'status_out' may be NULL, in which case the value is discarded.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in request;
+        PtlNIStatus_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.register_in = register_in;
+
+        status = do_forward(interface_in, PTL_NISTATUS, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_OK) {
+                if (status_out != NULL)
+                        *status_out = reply.status_out;
+                status = reply.rc;
+        }
+
+        return status;
+}
+
+/*
+ * Ask the interface for the distance to 'process_in'.
+ * 'distance_out' may be NULL, in which case the value is discarded.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in request;
+        PtlNIDist_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.process_in = process_in;
+
+        status = do_forward(interface_in, PTL_NIDIST, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_OK) {
+                if (distance_out != NULL)
+                        *distance_out = reply.distance_out;
+                status = reply.rc;
+        }
+
+        return status;
+}
+
+
+
+/*
+ * Forward a PTL_NIDEBUG request carrying 'mask_in' to the interface.
+ * NB a forwarding failure is returned through the same unsigned value
+ * as the 'rc' field of the reply block.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in request;
+        PtlNIDebug_out reply;
+        int status;
+
+        request.mask_in = mask_in;
+
+        status = do_forward(ni, PTL_NIDEBUG, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_OK)
+                return reply.rc;
+
+        return status;
+}
+
+/*
+ * Forward a PTL_MEATTACH request for portal index 'index_in'.
+ *
+ * When 'handle_out' is non-NULL it receives a handle made of the
+ * interface's nal_idx and the cookie from the reply block; this happens
+ * whenever forwarding succeeded, whatever the reply status.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in request;
+        PtlMEAttach_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.index_in = index_in;
+        request.match_id_in = match_id_in;
+        request.match_bits_in = match_bits_in;
+        request.ignore_bits_in = ignore_bits_in;
+        request.unlink_in = unlink_in;
+        request.position_in = pos_in;
+
+        status = do_forward(interface_in, PTL_MEATTACH, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->nal_idx = interface_in.nal_idx;
+                handle_out->cookie = reply.handle_out.cookie;
+        }
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MEINSERT request relative to the existing entry
+ * 'current_in'.
+ *
+ * A forwarding failure of PTL_INV_HANDLE is reported as PTL_INV_ME.
+ * When 'handle_out' is non-NULL it receives current_in's nal_idx and the
+ * cookie from the reply block.
+ */
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out)
+{
+        PtlMEInsert_in request;
+        PtlMEInsert_out reply;
+        int status;
+
+        request.current_in = current_in;
+        request.match_id_in = match_id_in;
+        request.match_bits_in = match_bits_in;
+        request.ignore_bits_in = ignore_bits_in;
+        request.unlink_in = unlink_in;
+        request.position_in = position_in;
+
+        status = do_forward(current_in, PTL_MEINSERT, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->nal_idx = current_in.nal_idx;
+                handle_out->cookie = reply.handle_out.cookie;
+        }
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MEUNLINK request for 'current_in'; the request's unlink
+ * field is always set to PTL_RETAIN.
+ * A forwarding failure of PTL_INV_HANDLE is reported as PTL_INV_ME.
+ */
+int PtlMEUnlink(ptl_handle_me_t current_in)
+{
+        PtlMEUnlink_in request;
+        PtlMEUnlink_out reply;
+        int status;
+
+        request.current_in = current_in;
+        request.unlink_in = PTL_RETAIN;
+
+        status = do_forward(current_in, PTL_MEUNLINK, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_TBLDUMP request for table index 'index_in'.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+{
+        PtlTblDump_in request;
+        PtlTblDump_out reply;
+        int status;
+
+        request.index_in = index_in;
+
+        status = do_forward(ni, PTL_TBLDUMP, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_OK)
+                status = reply.rc;
+
+        return status;
+}
+
+/*
+ * Forward a PTL_MEDUMP request for match entry 'current_in'.
+ * A forwarding failure of PTL_INV_HANDLE is reported as PTL_INV_ME.
+ */
+int PtlMEDump(ptl_handle_me_t current_in)
+{
+        PtlMEDump_in request;
+        PtlMEDump_out reply;
+        int status;
+
+        request.current_in = current_in;
+
+        status = do_forward(current_in, PTL_MEDUMP, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Check the memory described by 'md_in' with the NAL's validate hook.
+ *
+ * Returns PTL_NOINIT if the library is not initialised, PTL_INV_HANDLE if
+ * 'current_in' does not resolve to a NAL, PTL_SEGV if the hook rejects
+ * the buffer (or any fragment of an iovec MD), and 0 otherwise.  A NAL
+ * without a validate hook accepts everything.
+ */
+static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+{
+        struct iovec *frag;
+        nal_t *nal;
+        int rc;
+        int idx;
+
+        if (!ptl_init) {
+                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&current_in);
+        if (nal == NULL)
+                return PTL_INV_HANDLE;
+
+        /* no hook: validation is a NOOP for this NAL */
+        if (nal->validate == NULL)
+                return 0;
+
+        /* contiguous MD: a single region to check */
+        if ((md_in.options & PTL_MD_IOV) == 0) {
+                rc = nal->validate (nal, md_in.start, md_in.length);
+                return rc ? (PTL_SEGV) : 0;
+        }
+
+        /* iovec MD: 'start' is an array of 'niov' fragments */
+        frag = (struct iovec *)md_in.start;
+        for (idx = 0; idx < md_in.niov; idx++) {
+                rc = nal->validate (nal, frag[idx].iov_base,
+                                    frag[idx].iov_len);
+                if (rc)
+                        return (PTL_SEGV);
+        }
+
+        return 0;
+}
+
+/*
+ * Translate the user-visible EQ handle stored in 'md' into the handle
+ * stashed in the user-side event queue (cb_eq_handle).  PTL_EQ_NONE is
+ * passed through unchanged.
+ */
+static ptl_handle_eq_t md2eq (ptl_md_t *md)
+{
+        ptl_eq_t *user_eq;
+
+        if (PtlHandleEqual (md->eventq, PTL_EQ_NONE))
+                return (PTL_EQ_NONE);
+
+        user_eq = ptl_handle2usereq (&md->eventq);
+        return (user_eq->cb_eq_handle);
+}
+
+
+/*
+ * Validate 'md_in' and forward a PTL_MDATTACH request for match entry
+ * 'me_in'.
+ *
+ * PTL_INV_HANDLE from validation or forwarding is reported as PTL_INV_ME.
+ * When 'handle_out' is non-NULL it receives me_in's nal_idx and the
+ * cookie from the reply block.
+ */
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+{
+        PtlMDAttach_in request;
+        PtlMDAttach_out reply;
+        int status;
+
+        status = validate_md(me_in, md_in);
+        if (status != PTL_OK)
+                return (status == PTL_INV_HANDLE) ? PTL_INV_ME : status;
+
+        request.eq_in = md2eq(&md_in);
+        request.me_in = me_in;
+        request.md_in = md_in;
+        request.unlink_in = unlink_in;
+
+        status = do_forward(me_in, PTL_MDATTACH,
+                            &request, sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return (status == PTL_INV_HANDLE) ? PTL_INV_ME : status;
+
+        if (handle_out != NULL) {
+                handle_out->nal_idx = me_in.nal_idx;
+                handle_out->cookie = reply.handle_out.cookie;
+        }
+        return reply.rc;
+}
+
+
+
+/*
+ * Validate 'md_in' and forward a PTL_MDBIND request to interface 'ni_in'.
+ *
+ * When 'handle_out' is non-NULL it receives ni_in's nal_idx and the
+ * cookie from the reply block.  Returns the validation or forwarding
+ * error, or else the reply status.
+ */
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+                       ptl_handle_md_t * handle_out)
+{
+        PtlMDBind_in request;
+        PtlMDBind_out reply;
+        int status;
+
+        status = validate_md(ni_in, md_in);
+        if (status == PTL_OK) {
+                request.eq_in = md2eq(&md_in);
+                request.ni_in = ni_in;
+                request.md_in = md_in;
+
+                status = do_forward(ni_in, PTL_MDBIND, &request,
+                                    sizeof(request), &reply, sizeof(reply));
+        }
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->nal_idx = ni_in.nal_idx;
+                handle_out->cookie = reply.handle_out.cookie;
+        }
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MDUPDATE request for memory descriptor 'md_in'.
+ *
+ * 'old_inout' and 'new_inout' are both optional; the request carries a
+ * validity flag for each.  A supplied new descriptor is validated first.
+ * If 'testq_in' is not PTL_EQ_NONE, the request carries the stashed
+ * handle and current sequence number of that user-side event queue;
+ * otherwise it carries PTL_EQ_NONE and a sequence of -1.
+ *
+ * PTL_INV_HANDLE from validation or forwarding is reported as PTL_INV_MD.
+ * On successful forwarding *old_inout (if supplied) is overwritten from
+ * the reply block.
+ */
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+{
+        PtlMDUpdate_internal_in request;
+        PtlMDUpdate_internal_out reply;
+        int status;
+
+        request.md_in = md_in;
+
+        request.old_inout_valid = (old_inout != NULL) ? 1 : 0;
+        if (old_inout != NULL)
+                request.old_inout = *old_inout;
+
+        request.new_inout_valid = (new_inout != NULL) ? 1 : 0;
+        if (new_inout != NULL) {
+                status = validate_md (md_in, *new_inout);
+                if (status == PTL_INV_HANDLE)
+                        return PTL_INV_MD;
+                if (status != PTL_OK)
+                        return status;
+                request.new_inout = *new_inout;
+        }
+
+        if (!PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
+                ptl_eq_t *user_eq = ptl_handle2usereq (&testq_in);
+
+                request.testq_in = user_eq->cb_eq_handle;
+                request.sequence_in = user_eq->sequence;
+        } else {
+                request.testq_in = PTL_EQ_NONE;
+                request.sequence_in = -1;
+        }
+
+        status = do_forward(md_in, PTL_MDUPDATE, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_MD;
+        if (status != PTL_OK)
+                return status;
+
+        if (old_inout != NULL)
+                *old_inout = reply.old_inout;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MDUNLINK request for memory descriptor 'md_in'.
+ * A forwarding failure of PTL_INV_HANDLE is reported as PTL_INV_MD.
+ */
+int PtlMDUnlink(ptl_handle_md_t md_in)
+{
+        PtlMDUnlink_in request;
+        PtlMDUnlink_out reply;
+        int status;
+
+        request.md_in = md_in;
+
+        status = do_forward(md_in, PTL_MDUNLINK, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_MD;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Allocate an event queue on 'interface'.
+ *
+ * 'count' is rounded up to a power of two; a count of zero (or one that
+ * overflows when rounded) fails with PTL_VAL_FAILED.  The event array and
+ * the user-side queue descriptor are allocated here; the array is checked
+ * with the NAL's validate hook (if any) and then handed to the library
+ * via PTL_EQALLOC together with 'callback'.
+ *
+ * On success *handle_out is filled in and PTL_OK is returned.  On any
+ * failure both allocations are released and the error is returned.
+ */
+int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out)
+{
+        ptl_eq_t *eq = NULL;
+        ptl_event_t *ev = NULL;
+        PtlEQAlloc_in args;
+        PtlEQAlloc_out ret;
+        int rc, i;
+        nal_t *nal;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        nal = ptl_hndl2nal (&interface);
+        if (nal == NULL)
+                return PTL_INV_HANDLE;
+
+        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
+                do {                    /* knock off all but the top bit... */
+                        count &= ~LOWEST_BIT_SET (count);
+                } while (count != LOWEST_BIT_SET(count));
+
+                count <<= 1;                             /* ...and round up */
+        }
+
+        if (count == 0)        /* catch bad parameter / overflow on roundup */
+                return (PTL_VAL_FAILED);
+
+        PORTAL_ALLOC(ev, count * sizeof(ptl_event_t));
+        if (!ev)
+                return PTL_NOSPACE;
+
+        /* Allocate the user-side descriptor BEFORE asking the library to
+         * create its EQ.  Failing this allocation afterwards would free
+         * 'ev' while the library-side EQ still existed and pointed at it. */
+        PORTAL_ALLOC(eq, sizeof(*eq));
+        if (!eq) {
+                rc = PTL_NOSPACE;
+                goto fail;
+        }
+
+        for (i = 0; i < count; i++)
+                ev[i].sequence = 0;
+
+        if (nal->validate != NULL) {
+                rc = nal->validate(nal, ev, count * sizeof(ptl_event_t));
+                if (rc != PTL_OK)
+                        goto fail;
+        }
+
+        args.ni_in = interface;
+        args.count_in = count;
+        args.base_in = ev;
+        args.len_in = count * sizeof(*ev);
+        args.callback_in = callback;
+
+        rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                goto fail;
+        if (ret.rc)
+                GOTO(fail, rc = ret.rc);
+
+        eq->sequence = 1;
+        eq->size = count;
+        eq->base = ev;
+
+        /* EQ handles are a little weird.  PtlEQGet() just looks at the
+         * queued events in shared memory.  It doesn't want to do_forward()
+         * at all, so the cookie in the EQ handle we pass out of here is
+         * simply a pointer to the event queue we just set up.  We stash
+         * the handle returned by do_forward(), so we can pass it back via
+         * do_forward() when we need to. */
+
+        eq->cb_eq_handle.nal_idx = interface.nal_idx;
+        eq->cb_eq_handle.cookie = ret.handle_out.cookie;
+
+        handle_out->nal_idx = interface.nal_idx;
+        handle_out->cookie = (__u64)((unsigned long)eq);
+        return PTL_OK;
+
+fail:
+        if (eq != NULL) {
+                PORTAL_FREE(eq, sizeof(*eq));
+        }
+        PORTAL_FREE(ev, count * sizeof(ptl_event_t));
+        return rc;
+}
+
+/*
+ * Free the event queue named by 'eventq'.
+ *
+ * The library is asked to release its side first (PTL_EQFREE).  The event
+ * array and the user-side descriptor are freed only once the library has
+ * confirmed the release.  If forwarding fails, or the library refuses
+ * (do_PtlEQFree_internal() can answer PTL_INV_EQ or PTL_EQ_INUSE), the
+ * memory is kept, because the library may still reference the event
+ * array, and that error is returned to the caller.
+ */
+int PtlEQFree(ptl_handle_eq_t eventq)
+{
+        PtlEQFree_in args;
+        PtlEQFree_out ret;
+        ptl_eq_t *eq;
+        int rc;
+
+        eq = ptl_handle2usereq (&eventq);
+        args.eventq_in = eq->cb_eq_handle;
+
+        rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args,
+                        sizeof(args), &ret, sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;              /* 'ret' was never filled in */
+
+        if (ret.rc != PTL_OK)
+                return ret.rc;          /* library kept the EQ: so do we */
+
+        PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t));
+        PORTAL_FREE(eq, sizeof(*eq));
+
+        return PTL_OK;
+}
+
+/*
+ * Forward a PTL_ACENTRY request carrying the access-control index, match
+ * id and portal index.  Returns the forwarding error, or else the reply
+ * status.
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
+{
+        PtlACEntry_in request;
+        PtlACEntry_out reply;
+        int status;
+
+        /* marshal the arguments for the forwarding object */
+        request.ni_in = ni_in;
+        request.index_in = index_in;
+        request.match_id_in = match_id_in;
+        request.portal_in = portal_in;
+
+        status = do_forward(ni_in, PTL_ACENTRY, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_PUT request for memory descriptor 'md_in'.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
+{
+        PtlPut_in request;
+        PtlPut_out reply;
+        int status;
+
+        /* marshal the arguments for the forwarding object */
+        request.md_in = md_in;
+        request.ack_req_in = ack_req_in;
+        request.target_in = target_in;
+        request.portal_in = portal_in;
+        request.cookie_in = cookie_in;
+        request.match_bits_in = match_bits_in;
+        request.offset_in = offset_in;
+        request.hdr_data_in = hdr_data_in;
+
+        status = do_forward(md_in, PTL_PUT, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_GET request for memory descriptor 'md_in'.
+ * Returns the forwarding error, or else the reply status.
+ */
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
+{
+        PtlGet_in request;
+        PtlGet_out reply;
+        int status;
+
+        /* marshal the arguments for the forwarding object */
+        request.md_in = md_in;
+        request.target_in = target_in;
+        request.portal_in = portal_in;
+        request.cookie_in = cookie_in;
+        request.match_bits_in = match_bits_in;
+        request.offset_in = offset_in;
+
+        status = do_forward(md_in, PTL_GET, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
diff --git a/lnet/lnet/lib-dispatch.c b/lnet/lnet/lib-dispatch.c
new file mode 100644 (file)
index 0000000..63ed70f
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-dispatch.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/lib-dispatch.h>
+
+/* One entry of the API dispatch table: a handler plus its printable name */
+typedef struct {
+        /* handler; receives the NAL, the private pointer, and the in/out blocks */
+        int (*fun) (nal_cb_t * nal, void *private, void *in, void *out);
+        char *name;             /* name printed in debug messages */
+} dispatch_table_t;
+
+/*
+ * Handlers indexed by API command number.  Slots not named below are
+ * zero-filled (NULL handler, NULL name); the final entry follows the
+ * highest-numbered command and has a NULL handler and an empty name.
+ */
+static dispatch_table_t dispatch_table[] = {
+        [PTL_GETID]    = { do_PtlGetId,             "PtlGetId" },
+        [PTL_NISTATUS] = { do_PtlNIStatus,          "PtlNIStatus" },
+        [PTL_NIDIST]   = { do_PtlNIDist,            "PtlNIDist" },
+        [PTL_NIDEBUG]  = { do_PtlNIDebug,           "PtlNIDebug" },
+        [PTL_MEATTACH] = { do_PtlMEAttach,          "PtlMEAttach" },
+        [PTL_MEINSERT] = { do_PtlMEInsert,          "PtlMEInsert" },
+        [PTL_MEUNLINK] = { do_PtlMEUnlink,          "PtlMEUnlink" },
+        [PTL_TBLDUMP]  = { do_PtlTblDump,           "PtlTblDump" },
+        [PTL_MEDUMP]   = { do_PtlMEDump,            "PtlMEDump" },
+        [PTL_MDATTACH] = { do_PtlMDAttach,          "PtlMDAttach" },
+        [PTL_MDBIND]   = { do_PtlMDBind,            "PtlMDBind" },
+        [PTL_MDUPDATE] = { do_PtlMDUpdate_internal, "PtlMDUpdate_internal" },
+        [PTL_MDUNLINK] = { do_PtlMDUnlink,          "PtlMDUnlink" },
+        [PTL_EQALLOC]  = { do_PtlEQAlloc_internal,  "PtlEQAlloc_internal" },
+        [PTL_EQFREE]   = { do_PtlEQFree_internal,   "PtlEQFree_internal" },
+        [PTL_ACENTRY]  = { do_PtlACEntry,           "PtlACEntry" },
+        [PTL_PUT]      = { do_PtlPut,               "PtlPut" },
+        [PTL_GET]      = { do_PtlGet,               "PtlGet" },
+        [PTL_FAILNID]  = { do_PtlFailNid,           "PtlFailNid" },
+        /* end marker */ { 0, "" }
+};
+
+/*
+ * This really should be elsewhere, but lib-p30/dispatch.c is
+ * an automatically generated file.
+ *
+ * Run the handler registered for API command 'index', passing it the
+ * caller's argument and return blocks.  An index that is negative, above
+ * LIB_MAX_DISPATCH, outside the table, or without a handler is logged and
+ * ignored; NB 'ret_block' is left untouched in that case.
+ */
+void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block,
+                  void *ret_block)
+{
+        lib_ni_t *ni = &nal->ni;
+        /* real size of the table, independent of how LIB_MAX_DISPATCH
+         * happens to be defined */
+        const int nentries = (int)(sizeof(dispatch_table) /
+                                   sizeof(dispatch_table[0]));
+
+        if (index < 0 || index > LIB_MAX_DISPATCH || index >= nentries ||
+            !dispatch_table[index].fun) {
+                CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index);
+                return;
+        }
+
+        CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid,
+               dispatch_table[index].name, index);
+
+        dispatch_table[index].fun(nal, private, arg_block, ret_block);
+}
+
+/*
+ * Return the printable name registered for API command 'index'.
+ * An index outside the table yields an empty string rather than a read
+ * beyond the array.  In-range slots are returned as stored, so a slot
+ * with no registered command still yields NULL.
+ */
+char *dispatch_name(int index)
+{
+        const int nentries = (int)(sizeof(dispatch_table) /
+                                   sizeof(dispatch_table[0]));
+
+        if (index < 0 || index >= nentries)
+                return "";
+
+        return dispatch_table[index].name;
+}
diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c
new file mode 100644 (file)
index 0000000..4c6c292
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-eq.c
+ * Library level Event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Library-side handler for PTL_EQALLOC: create an event queue over the
+ * event array supplied by the API side.
+ *
+ * The status is stored in ret->rc and also returned: PTL_VAL_FAILED if
+ * count_in is not a power of two, PTL_NOSPACE if no EQ descriptor could
+ * be allocated, the cb_map() error if mapping the array failed, and
+ * PTL_OK otherwise.
+ */
+int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args,
+                           void *v_ret)
+{
+        /*
+         * Incoming (fields read here):
+         *      ptl_size_t count_in
+         *      void                    * base_in
+         *      callback_in
+         *
+         * Outgoing:
+         *      ptl_handle_eq_t         * handle_out
+         *      rc
+         */
+
+        PtlEQAlloc_in *args = v_args;
+        PtlEQAlloc_out *ret = v_ret;
+
+        lib_eq_t *eq;
+        unsigned long flags;
+
+        /* api should have rounded up */
+        if (args->count_in != LOWEST_BIT_SET (args->count_in))
+                return ret->rc = PTL_VAL_FAILED;
+
+        eq = lib_eq_alloc (nal);
+        if (eq == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        /* NALs that provide cb_map get the event array mapped first */
+        if (nal->cb_map != NULL) {
+                struct iovec iov = {
+                        .iov_base = args->base_in,
+                        .iov_len = args->count_in * sizeof (ptl_event_t) };
+
+                ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey);
+                if (ret->rc != PTL_OK) {
+                        /* mapping failed: release the descriptor under the lock */
+                        lib_eq_free (nal, eq);
+                        
+                        state_unlock (nal, &flags);
+                        return (ret->rc);
+                }
+        }
+
+        eq->sequence = 1;
+        eq->base = args->base_in;
+        eq->size = args->count_in;
+        eq->eq_refcount = 0;
+        eq->event_callback = args->callback_in;
+
+        /* give the EQ a handle and put it on the NI's active list */
+        lib_initialise_handle (nal, &eq->eq_lh);
+        list_add (&eq->eq_list, &nal->ni.ni_active_eqs);
+
+        state_unlock(nal, &flags);
+
+        ptl_eq2handle(&ret->handle_out, eq);
+        return (ret->rc = PTL_OK);
+}
+
+/*
+ * Library-side handler for PTL_EQFREE: destroy the event queue named by
+ * args->eventq_in.
+ *
+ * The status is stored in ret->rc and also returned: PTL_INV_EQ if the
+ * handle does not resolve to an EQ, PTL_EQ_INUSE if its reference count
+ * is non-zero, and PTL_OK once the EQ has been unmapped (when the NAL
+ * provides cb_unmap), invalidated, unlinked and freed.
+ */
+int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args,
+                          void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_eq_t eventq_in
+         *
+         * Outgoing:
+         *      rc
+         */
+
+        PtlEQFree_in *args = v_args;
+        PtlEQFree_out *ret = v_ret;
+        lib_eq_t *eq;
+        /* same type as do_PtlEQAlloc_internal() passes to state_lock() */
+        unsigned long flags;
+
+        state_lock (nal, &flags);
+
+        eq = ptl_handle2eq(&args->eventq_in, nal);
+        if (eq == NULL) {
+                ret->rc = PTL_INV_EQ;
+        } else if (eq->eq_refcount != 0) {
+                ret->rc = PTL_EQ_INUSE;
+        } else {
+                if (nal->cb_unmap != NULL) {
+                        struct iovec iov = {
+                                .iov_base = eq->base,
+                                .iov_len = eq->size * sizeof (ptl_event_t) };
+
+                        nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey);
+                }
+
+                lib_invalidate_handle (nal, &eq->eq_lh);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock (nal, &flags);
+
+        return (ret->rc);
+}
diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c
new file mode 100644 (file)
index 0000000..40f3d2c
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-init.c
+ * Start up the internal library and clear all structures
+ * Called by the NAL when it initializes.  Safe to call multiple times.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+
+#ifdef __KERNEL__
+# include <linux/string.h>      /* for memset() */
+# include <linux/kp30.h>
+# ifdef KERNEL_ADDR_CACHE
+#  include <compute/OS/addrCache/cache.h>
+# endif
+#else
+# include <string.h>
+# include <sys/time.h>
+#endif
+
+#ifdef PTL_USE_SLAB_CACHE
+/* Number of kportal_descriptor_setup() calls not yet balanced by
+ * kportal_descriptor_cleanup(); the first call creates the slabs below and
+ * the last cleanup destroys them. */
+static int ptl_slab_users;
+
+/* One slab cache per descriptor type, shared by all NALs */
+kmem_cache_t *ptl_md_slab;
+kmem_cache_t *ptl_msg_slab;
+kmem_cache_t *ptl_me_slab;
+kmem_cache_t *ptl_eq_slab;
+
+/* Asserted to be zero before the slabs are destroyed.  They are not
+ * modified in this file; presumably the descriptor allocators maintain
+ * them -- TODO confirm. */
+atomic_t md_in_use_count;
+atomic_t msg_in_use_count;
+atomic_t me_in_use_count;
+atomic_t eq_in_use_count;
+
+/* NB zeroing in ctor and on freeing ensures items that
+ * kmem_cache_validate() OK, but haven't been initialised
+ * as an MD/ME/EQ can't have valid handles
+ */
+static void
+ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        /* Slab constructor for MDs: the object is handed out zero-filled.
+         * 'slab' and 'flags' are not used. */
+        lib_md_t *md = obj;
+
+        memset (md, 0, sizeof (*md));
+}
+
+static void
+ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        /* Slab constructor for MEs: the object is handed out zero-filled.
+         * 'slab' and 'flags' are not used. */
+        lib_me_t *me = obj;
+
+        memset (me, 0, sizeof (*me));
+}
+
+static void
+ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        /* Slab constructor for EQs: the object is handed out zero-filled.
+         * 'slab' and 'flags' are not used. */
+        lib_eq_t *eq = obj;
+
+        memset (eq, 0, sizeof (*eq));
+}
+
+/*
+ * Slab-cache variant: create the MD/MSG/ME/EQ slab caches.
+ *
+ * There is one set of slabs for ALL the NALs; only the first caller
+ * (ptl_slab_users going 0 -> 1) creates them, later callers just take a
+ * reference and return 0.
+ *
+ * Returns PTL_OK on success, PTL_NOSPACE if a cache could not be created.
+ *
+ * NB on failure the caller MUST still call kportal_descriptor_cleanup():
+ * the user count has already been taken and earlier caches may exist.
+ */
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        if (ptl_slab_users++)
+                return 0;
+
+        ptl_md_slab = kmem_cache_create("portals_MD",
+                                        sizeof(lib_md_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_md_slab_ctor, NULL);
+        if (!ptl_md_slab) {
+                CERROR("couldn't allocate ptl_md_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        /* NB no ctor for msgs; they don't need handle verification */
+        ptl_msg_slab = kmem_cache_create("portals_MSG",
+                                         sizeof(lib_msg_t), 0,
+                                         SLAB_HWCACHE_ALIGN,
+                                         NULL, NULL);
+        if (!ptl_msg_slab) {
+                CERROR("couldn't allocate ptl_msg_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_me_slab = kmem_cache_create("portals_ME",
+                                        sizeof(lib_me_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_me_slab_ctor, NULL);
+        if (!ptl_me_slab) {
+                CERROR("couldn't allocate ptl_me_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_eq_slab = kmem_cache_create("portals_EQ",
+                                        sizeof(lib_eq_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_eq_slab_ctor, NULL);
+        if (!ptl_eq_slab) {
+                CERROR("couldn't allocate ptl_eq_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        RETURN(PTL_OK);
+}
+
+/*
+ * Slab-cache variant: drop one user of the shared slabs and destroy them
+ * when the last user goes away.
+ *
+ * All in-use counters are asserted to be zero before anything is
+ * destroyed.  Each cache pointer is reset to NULL once destroyed so that a
+ * later setup that fails part-way, followed by the mandatory cleanup call,
+ * cannot destroy a stale cache a second time.
+ */
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        if (--ptl_slab_users != 0)
+                return;
+
+        LASSERT (atomic_read (&md_in_use_count) == 0);
+        LASSERT (atomic_read (&me_in_use_count) == 0);
+        LASSERT (atomic_read (&eq_in_use_count) == 0);
+        LASSERT (atomic_read (&msg_in_use_count) == 0);
+
+        if (ptl_md_slab != NULL) {
+                kmem_cache_destroy(ptl_md_slab);
+                ptl_md_slab = NULL;
+        }
+        if (ptl_msg_slab != NULL) {
+                kmem_cache_destroy(ptl_msg_slab);
+                ptl_msg_slab = NULL;
+        }
+        if (ptl_me_slab != NULL) {
+                kmem_cache_destroy(ptl_me_slab);
+                ptl_me_slab = NULL;
+        }
+        if (ptl_eq_slab != NULL) {
+                kmem_cache_destroy(ptl_eq_slab);
+                ptl_eq_slab = NULL;
+        }
+}
+#else
+
+/*
+ * Carve one cb_malloc()ed region into 'n' zeroed objects and thread them
+ * all onto fl->fl_list.  Each object is 'size' bytes of payload preceded
+ * by the lib_freeobj_t header (everything before fo_contents).
+ *
+ * Returns PTL_OK, or PTL_NOSPACE if the region could not be allocated (in
+ * which case 'fl' is left untouched).
+ */
+int
+lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
+{
+        char *cursor;
+
+        LASSERT (n > 0);
+
+        /* account for the per-object header */
+        size = size + offsetof (lib_freeobj_t, fo_contents);
+
+        cursor = nal->cb_malloc (nal, n * size);
+        if (cursor == NULL)
+                return (PTL_NOSPACE);
+
+        fl->fl_objs = cursor;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+        INIT_LIST_HEAD (&fl->fl_list);
+
+        for (;;) {
+                /* the object is linked through its first bytes */
+                memset (cursor, 0, size);
+                list_add ((struct list_head *)cursor, &fl->fl_list);
+                cursor += size;
+
+                if (--n == 0)
+                        break;
+        }
+
+        return (PTL_OK);
+}
+
+/*
+ * Release the region behind a freelist set up by lib_freelist_init().
+ *
+ * A freelist with fl_nobjs == 0 (never initialised, or already finalised)
+ * is ignored.  Otherwise every object must be back on the list; this is
+ * asserted by counting the list entries.  The descriptor is zeroed
+ * afterwards so that a repeated call is a no-op.
+ */
+void
+lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
+{
+        struct list_head *el;
+        int               count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        /* clear the whole descriptor, not just pointer-size bytes of it */
+        memset (fl, 0, sizeof (*fl));
+}
+
+/*
+ * Freelist variant: give this NAL its own pre-allocated pools of MEs,
+ * MSGs, MDs and EQs.
+ *
+ * Returns PTL_OK, or the first failure reported by lib_freelist_init().
+ *
+ * NB on failure the caller MUST still call kportal_descriptor_cleanup()
+ * to release the pools that were created before the failure.
+ */
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+        int       rc;
+
+        /* start from empty descriptors so cleanup can tell what exists */
+        memset (&ni->ni_free_mes,  0, sizeof (ni->ni_free_mes));
+        memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs));
+        memset (&ni->ni_free_mds,  0, sizeof (ni->ni_free_mds));
+        memset (&ni->ni_free_eqs,  0, sizeof (ni->ni_free_eqs));
+
+        rc = lib_freelist_init (nal, &ni->ni_free_mes,
+                                MAX_MES, sizeof (lib_me_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_msgs,
+                                        MAX_MSGS, sizeof (lib_msg_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_mds,
+                                        MAX_MDS, sizeof (lib_md_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_eqs,
+                                        MAX_EQS, sizeof (lib_eq_t));
+
+        return (rc);
+}
+
+/*
+ * Freelist variant: release this NAL's ME, MSG, MD and EQ pools.
+ */
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        lib_freelist_fini (nal, &ni->ni_free_mes);
+        lib_freelist_fini (nal, &ni->ni_free_msgs);
+        lib_freelist_fini (nal, &ni->ni_free_mds);
+        lib_freelist_fini (nal, &ni->ni_free_eqs);
+}
+
+#endif
+
+/*
+ * Build the interface cookie from the current time of day, expressed as
+ * seconds * 1000000 + microseconds.
+ *
+ * NB the interface cookie in wire handles guards against delayed replies
+ * and ACKs appearing valid in a new instance of the same interface;
+ * initialisation time is considered easily good enough for that, even
+ * where the clock is coarser than the value's units.
+ */
+__u64
+lib_create_interface_cookie (nal_cb_t *nal)
+{
+        struct timeval now;
+        __u64          cookie;
+#ifdef __KERNEL__
+        do_gettimeofday(&now);
+#else
+        int            rc = gettimeofday (&now, NULL);
+        LASSERT (rc == 0);
+#endif
+        cookie = ((__u64)now.tv_sec) * 1000000 + now.tv_usec;
+
+        return (cookie);
+}
+
+/*
+ * Allocate the handle hash table, initialise every bucket to an empty
+ * list and restart the object cookie sequence at 0.
+ *
+ * The table size is an arbitrary choice: one page worth of list heads in
+ * the kernel, a quarter of the total descriptor count otherwise.
+ *
+ * Returns PTL_OK, or PTL_NOSPACE if the table could not be allocated.
+ */
+int
+lib_setup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *table;
+        int               i;
+
+#ifndef __KERNEL__
+        ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#else
+        ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head);
+#endif
+        table = (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size
+                                                    * sizeof (struct list_head));
+        ni->ni_lh_hash_table = table;
+        if (table == NULL)
+                return (PTL_NOSPACE);
+
+        for (i = 0; i < ni->ni_lh_hash_size; i++)
+                INIT_LIST_HEAD (&table[i]);
+
+        ni->ni_next_object_cookie = 0;
+
+        return (PTL_OK);
+}
+
+/*
+ * Free the handle hash table, if there is one.
+ *
+ * The table pointer is reset to NULL afterwards, so calling this again
+ * (e.g. from an error path of a later initialisation) is a harmless
+ * no-op rather than a double free.
+ */
+void
+lib_cleanup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->ni_lh_hash_table == NULL)
+                return;
+
+        nal->cb_free (nal, ni->ni_lh_hash_table,
+                      ni->ni_lh_hash_size * sizeof (struct list_head));
+        ni->ni_lh_hash_table = NULL;
+}
+
+/*
+ * Find the handle carrying 'cookie' in the handle hash table.
+ * Returns the handle, or NULL if no handle in the bucket matches.
+ *
+ * ALWAYS called with statelock held.
+ */
+lib_handle_t *
+lib_lookup_cookie (nal_cb_t *nal, __u64 cookie)
+{
+        lib_ni_t            *ni = &nal->ni;
+        unsigned int         bucket = ((unsigned int)cookie) % ni->ni_lh_hash_size;
+        struct list_head    *chain = &ni->ni_lh_hash_table[bucket];
+        struct list_head    *pos;
+        lib_handle_t        *found = NULL;
+
+        list_for_each (pos, chain) {
+                lib_handle_t *candidate = list_entry (pos, lib_handle_t,
+                                                      lh_hash_chain);
+
+                if (candidate->lh_cookie == cookie) {
+                        found = candidate;
+                        break;
+                }
+        }
+
+        return (found);
+}
+
+/*
+ * Give 'lh' the next object cookie and insert it into the hash bucket
+ * selected by that cookie.
+ *
+ * ALWAYS called with statelock held.
+ */
+void
+lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *bucket;
+
+        lh->lh_cookie = ni->ni_next_object_cookie;
+        ni->ni_next_object_cookie++;
+
+        bucket = &ni->ni_lh_hash_table[((unsigned int)lh->lh_cookie) %
+                                       ni->ni_lh_hash_size];
+        list_add (&lh->lh_hash_chain, bucket);
+}
+
+/*
+ * Take 'lh' off its hash chain so that cookie lookups no longer find it.
+ */
+void
+lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        struct list_head *chain_link = &lh->lh_hash_chain;
+
+        list_del (chain_link);
+}
+
+/*
+ * Start up the library side of an interface.
+ *
+ * The first call allocates the descriptor pools, the handle hash and the
+ * portal table (ptl_size match lists) and records nid/pid/gsize; every
+ * later call only takes another reference.  'acl_size' is not used here.
+ *
+ * Returns PTL_OK, or the failure code of the step that failed; on failure
+ * whatever had been set up is released again and no reference is taken.
+ *
+ * NB serialised in PtlNIInit().
+ */
+int
+lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+         ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size)
+{
+        int       rc = PTL_OK;
+        lib_ni_t *ni = &nal->ni;
+        int i;
+        ENTRY;
+
+        if (ni->refcnt != 0) {                       /* already initialised */
+                ni->refcnt++;
+                goto out;
+        }
+
+        /* The error path below runs lib_cleanup_handle_hash() no matter
+         * how far we got; make sure it can never see an uninitialised
+         * pointer or one left over from a previous instance. */
+        ni->ni_lh_hash_table = NULL;
+
+        /*
+         * Allocate the portal table for this interface
+         * and all per-interface objects.
+         */
+        memset(&ni->counters, 0, sizeof(lib_counters_t));
+
+        rc = kportal_descriptor_setup (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        INIT_LIST_HEAD (&ni->ni_active_msgs);
+        INIT_LIST_HEAD (&ni->ni_active_mds);
+        INIT_LIST_HEAD (&ni->ni_active_eqs);
+
+        INIT_LIST_HEAD (&ni->ni_test_peers);
+
+        ni->ni_interface_cookie = lib_create_interface_cookie (nal);
+        ni->ni_next_object_cookie = 0;
+        rc = lib_setup_handle_hash (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        ni->nid = nid;
+        ni->pid = pid;
+
+        ni->num_nodes = gsize;
+        ni->tbl.size = ptl_size;
+
+        ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size);
+        if (ni->tbl.tbl == NULL) {
+                rc = PTL_NOSPACE;
+                goto out;
+        }
+
+        for (i = 0; i < ptl_size; i++)
+                INIT_LIST_HEAD(&(ni->tbl.tbl[i]));
+
+        ni->debug = PTL_DEBUG_NONE;
+        ni->up = 1;
+        ni->refcnt++;
+
+ out:
+        if (rc != PTL_OK) {
+                /* kportal_descriptor_setup() requires its cleanup to be
+                 * called even when it failed itself */
+                lib_cleanup_handle_hash (nal);
+                kportal_descriptor_cleanup (nal);
+        }
+
+        RETURN (rc);
+}
+
+/*
+ * Drop one reference on the interface; the last reference tears it down.
+ *
+ * On teardown every ME still on a portal list and every MD, EQ and MSG
+ * still on the active lists is reported with CERROR, unlinked and freed;
+ * then the portal table, the handle hash and the descriptor pools are
+ * released.  Always returns PTL_OK.
+ */
+int
+lib_fini(nal_cb_t * nal)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *head;
+        int               idx;
+
+        if (--ni->refcnt != 0)
+                return (PTL_OK);
+
+        /* NB the state lock is not taken since this is the last
+         * reference.  The NAL should have shut down already, so it should
+         * be safe to unlink and free all descriptors, even those that
+         * appear committed to a network op (eg MD with non-zero pending
+         * count)
+         */
+
+        for (idx = 0; idx < ni->tbl.size; idx++) {
+                head = &ni->tbl.tbl[idx];
+
+                while (!list_empty (head)) {
+                        lib_me_t *me = list_entry (head->next,
+                                                   lib_me_t, me_list);
+
+                        CERROR ("Active me %p on exit\n", me);
+                        list_del (&me->me_list);
+                        lib_me_free (nal, me);
+                }
+        }
+
+        head = &ni->ni_active_mds;
+        while (!list_empty (head)) {
+                lib_md_t *md = list_entry (head->next, lib_md_t, md_list);
+
+                CERROR ("Active md %p on exit\n", md);
+                list_del (&md->md_list);
+                lib_md_free (nal, md);
+        }
+
+        head = &ni->ni_active_eqs;
+        while (!list_empty (head)) {
+                lib_eq_t *eq = list_entry (head->next, lib_eq_t, eq_list);
+
+                CERROR ("Active eq %p on exit\n", eq);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+        }
+
+        head = &ni->ni_active_msgs;
+        while (!list_empty (head)) {
+                lib_msg_t *msg = list_entry (head->next, lib_msg_t, msg_list);
+
+                CERROR ("Active msg %p on exit\n", msg);
+                list_del (&msg->msg_list);
+                lib_msg_free (nal, msg);
+        }
+
+        nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size);
+        ni->up = 0;
+
+        lib_cleanup_handle_hash (nal);
+        kportal_descriptor_cleanup (nal);
+
+        return (PTL_OK);
+}
diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c
new file mode 100644 (file)
index 0000000..d171050
--- /dev/null
@@ -0,0 +1,412 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-md.c
+ * Memory Descriptor management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Unlink and free an MD, or -- while network operations are still pending
+ * on it -- just flag it with PTL_MD_FLAG_UNLINK and return.
+ *
+ * An actual unlink unmaps the MD's fragments (when the NAL provides the
+ * matching unmap callback), detaches it from its ME (unlinking the ME too
+ * if that was created with PTL_UNLINK), drops its EQ reference,
+ * invalidates its handle and frees it.
+ *
+ * Must be called with the state lock held.
+ */
+void lib_md_unlink(nal_cb_t * nal, lib_md_t * md)
+{
+        lib_me_t *me = md->me;
+
+        if (md->pending != 0) {
+                CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+                md->md_flags |= PTL_MD_FLAG_UNLINK;
+                return;
+        }
+
+        CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+        if ((md->options & PTL_MD_KIOV) == 0) {
+                if (nal->cb_unmap != NULL)
+                        nal->cb_unmap (nal, md->md_niov, md->md_iov.iov,
+                                       &md->md_addrkey);
+        } else if (nal->cb_unmap_pages != NULL) {
+                nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov,
+                                     &md->md_addrkey);
+        }
+
+        if (me != NULL) {
+                me->md = NULL;
+                if (me->unlink == PTL_UNLINK)
+                        lib_me_unlink(nal, me);
+        }
+
+        if (md->eq != NULL) {
+                md->eq->eq_refcount--;
+                LASSERT (md->eq->eq_refcount >= 0);
+        }
+
+        lib_invalidate_handle (nal, &md->md_lh);
+        list_del (&md->md_list);
+        lib_md_free(nal, md);
+}
+
+/* must be called with state lock held */
+static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
+                        ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink)
+{
+        const int     max_size_opts = PTL_MD_AUTO_UNLINK |
+                                      PTL_MD_MAX_SIZE;
+        lib_eq_t     *eq = NULL;
+        int           rc;
+        int           i;
+
+        /* NB we are passes an allocated, but uninitialised/active md.
+         * if we return success, caller may lib_md_unlink() it.
+         * otherwise caller may only lib_md_free() it.
+         */
+
+        if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) {
+                eq = ptl_handle2eq(eqh, nal);
+                if (eq == NULL)
+                        return PTL_INV_EQ;
+        }
+
+        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
+            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
+                return PTL_IOV_TOO_MANY;
+
+        if ((md->options & max_size_opts) != 0 && /* max size used */
+            (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
+                return PTL_INV_MD;
+
+        new->me = NULL;
+        new->start = md->start;
+        new->length = md->length;
+        new->offset = 0;
+        new->max_size = md->max_size;
+        new->unlink = unlink;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        new->eq = eq;
+        new->threshold = md->threshold;
+        new->pending = 0;
+        new->md_flags = 0;
+
+        if ((md->options & PTL_MD_IOV) != 0) {
+                int total_length = 0;
+
+                if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */
+                        return PTL_INV_MD; 
+
+                new->md_niov = md->niov;
+                
+                if (nal->cb_read (nal, private, new->md_iov.iov, md->start,
+                                  md->niov * sizeof (new->md_iov.iov[0])))
+                        return PTL_SEGV;
+
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the base address on trust */
+                        if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                                return PTL_VAL_FAILED;
+
+                        total_length += new->md_iov.iov[i].iov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+                
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+#ifndef __KERNEL__
+                return PTL_INV_MD;
+#else
+                int total_length = 0;
+                
+                /* Trap attempt to use paged I/O if unsupported early. */
+                if (nal->cb_send_pages == NULL ||
+                    nal->cb_recv_pages == NULL)
+                        return PTL_INV_MD;
+
+                new->md_niov = md->niov;
+
+                if (nal->cb_read (nal, private, new->md_iov.kiov, md->start,
+                                  md->niov * sizeof (new->md_iov.kiov[0])))
+                        return PTL_SEGV;
+                
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the page pointer on trust */
+                        if (new->md_iov.kiov[i].kiov_offset + 
+                            new->md_iov.kiov[i].kiov_len > PAGE_SIZE )
+                                return PTL_VAL_FAILED; /* invalid length */
+
+                        total_length += new->md_iov.kiov[i].kiov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+
+                if (nal->cb_map_pages != NULL) {
+                        rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, 
+                                                &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+#endif
+        } else {   /* contiguous */
+                new->md_niov = 1;
+                new->md_iov.iov[0].iov_base = md->start;
+                new->md_iov.iov[0].iov_len = md->length;
+
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } 
+
+        if (eq != NULL)
+                eq->eq_refcount++;
+
+        /* It's good; let handle2md succeed and add to active mds */
+        lib_initialise_handle (nal, &new->md_lh);
+        list_add (&new->md_list, &nal->ni.ni_active_mds);
+
+        return PTL_OK;
+}
+
+/*
+ * Copy the user-visible description of library MD 'md' out into 'new'.
+ *
+ * Must be called with the state lock held.
+ *
+ * NB the fragment list itself is not copied out: for a discontiguous MD
+ * the target only learns the original fragment array pointer (in start)
+ * and how many entries it had.  A contiguous MD reports niov == 0.
+ */
+void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new)
+{
+        const int frag_opts = PTL_MD_IOV | PTL_MD_KIOV;
+
+        new->start     = md->start;
+        new->length    = md->length;
+        new->threshold = md->threshold;
+        new->max_size  = md->max_size;
+        new->options   = md->options;
+        new->user_ptr  = md->user_ptr;
+        ptl_eq2handle(&new->eventq, md->eq);
+
+        if ((md->options & frag_opts) != 0)
+                new->niov = md->md_niov;
+        else
+                new->niov = 0;
+}
+
+/*
+ * Build a new MD and attach it to a match entry.
+ *
+ * Incoming (PtlMDAttach_in):
+ *      ptl_handle_me_t me_in
+ *      ptl_md_t md_in
+ *      ptl_handle_eq_t eq_in
+ *      ptl_unlink_t unlink_in
+ *
+ * Outgoing (PtlMDAttach_out):
+ *      ptl_handle_md_t handle_out
+ *
+ * The result is stored in ret->rc and also returned: PTL_NOSPACE if no MD
+ * could be allocated, PTL_INV_ME if me_in does not resolve, PTL_INUSE if
+ * the ME already has an MD, otherwise whatever lib_md_build() reports.
+ */
+int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDAttach_in *args = v_args;
+        PtlMDAttach_out *ret = v_ret;
+        lib_me_t *me;
+        lib_md_t *md;
+        unsigned long flags;
+
+        /* allocate before taking the state lock */
+        md = lib_md_alloc (nal);
+        if (md == NULL) {
+                ret->rc = PTL_NOSPACE;
+                return (ret->rc);
+        }
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->me_in, nal);
+        if (me == NULL)
+                ret->rc = PTL_INV_ME;
+        else if (me->md != NULL)
+                ret->rc = PTL_INUSE;
+        else
+                ret->rc = lib_md_build(nal, md, private, &args->md_in,
+                                       &args->eq_in, args->unlink_in);
+
+        if (ret->rc != PTL_OK) {
+                /* never became active, so a plain free is all it needs */
+                lib_md_free (nal, md);
+        } else {
+                me->md = md;
+                md->me = me;
+
+                ptl_md2handle(&ret->handle_out, md);
+        }
+
+        state_unlock (nal, &flags);
+        return (ret->rc);
+}
+
+/*
+ * Build a free-floating MD (not attached to any match entry); it is
+ * always built with PTL_UNLINK.
+ *
+ * Incoming (PtlMDBind_in):
+ *      ptl_handle_ni_t ni_in
+ *      ptl_md_t md_in
+ *      ptl_handle_eq_t eq_in
+ *
+ * Outgoing (PtlMDBind_out):
+ *      ptl_handle_md_t handle_out
+ *
+ * The result is stored in ret->rc and also returned: PTL_NOSPACE if no MD
+ * could be allocated, otherwise whatever lib_md_build() reports.
+ */
+int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDBind_in *args = v_args;
+        PtlMDBind_out *ret = v_ret;
+        lib_md_t *md;
+        unsigned long flags;
+
+        /* allocate before taking the state lock */
+        md = lib_md_alloc (nal);
+        if (md == NULL) {
+                ret->rc = PTL_NOSPACE;
+                return (ret->rc);
+        }
+
+        state_lock(nal, &flags);
+
+        ret->rc = lib_md_build(nal, md, private,
+                               &args->md_in, &args->eq_in, PTL_UNLINK);
+
+        if (ret->rc != PTL_OK)
+                lib_md_free (nal, md);  /* never became active */
+        else
+                ptl_md2handle(&ret->handle_out, md);
+
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
+
+/*
+ * Unlink an idle MD and hand its final description back to the caller.
+ *
+ * Incoming (PtlMDUnlink_in):
+ *      ptl_handle_md_t md_in
+ *
+ * Outgoing (PtlMDUnlink_out):
+ *      ptl_md_t status_out
+ *
+ * The result is stored in ret->rc and also returned: PTL_INV_MD if md_in
+ * does not resolve, PTL_MD_INUSE while operations are pending on the MD,
+ * PTL_OK once it has been unlinked.
+ */
+int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDUnlink_in *args = v_args;
+        PtlMDUnlink_out *ret = v_ret;
+        unsigned long flags;
+        lib_md_t *md;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                ret->rc = PTL_INV_MD;
+                goto out;
+        }
+
+        if (md->pending != 0) {                  /* being filled/spilled */
+                ret->rc = PTL_MD_INUSE;
+                goto out;
+        }
+
+        /* Callers attempting to unlink a busy MD which will get
+         * unlinked once the net op completes should see INUSE,
+         * before completion and INV_MD thereafter.  LASSERT we've
+         * got that right... */
+        LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
+
+        /* snapshot first: lib_md_unlink() frees the MD */
+        lib_md_deconstruct(nal, md, &ret->status_out);
+        lib_md_unlink(nal, md);
+        ret->rc = PTL_OK;
+
+ out:
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+/*
+ * Optionally read back an MD's current description and optionally replace
+ * it with a new one, all under the state lock.
+ *
+ * The result is stored in ret->rc and also returned:
+ *      PTL_INV_MD    md_in does not resolve
+ *      PTL_INV_EQ    testq_in is not PTL_EQ_NONE and does not resolve
+ *      PTL_NOUPDATE  operations are pending on the MD, or the test EQ's
+ *                    sequence differs from sequence_in
+ *      PTL_OK        nothing to update (new_inout_valid is false)
+ *      otherwise     whatever lib_md_build() reports for the new contents
+ */
+int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
+                            void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         *      ptl_handle_eq_t testq_in
+         *      ptl_seq_t               sequence_in
+         *
+         * Outgoing:
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         */
+        PtlMDUpdate_internal_in *args = v_args;
+        PtlMDUpdate_internal_out *ret = v_ret;
+        lib_md_t *md;
+        lib_eq_t *test_eq = NULL;
+        ptl_md_t *new = &args->new_inout;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                 ret->rc = PTL_INV_MD;
+                 goto out;
+        }
+
+        /* report the current contents before they can be replaced */
+        if (args->old_inout_valid)
+                lib_md_deconstruct(nal, md, &ret->old_inout);
+
+        if (!args->new_inout_valid) {
+                ret->rc = PTL_OK;
+                goto out;
+        }
+
+        if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
+                test_eq = ptl_handle2eq(&args->testq_in, nal);
+                if (test_eq == NULL) {
+                        ret->rc = PTL_INV_EQ;
+                        goto out;
+                }
+        }
+
+        /* a busy MD is never updated */
+        if (md->pending != 0) {
+                        ret->rc = PTL_NOUPDATE;
+                        goto out;
+        }
+
+        /* update only if no test EQ was given or its sequence number still
+         * matches the one the caller supplied */
+        if (test_eq == NULL ||
+            test_eq->sequence == args->sequence_in) {
+                /* lib_md_build() resets md->me to NULL; remember it */
+                lib_me_t *me = md->me;
+
+#warning this does not track eq refcounts properly
+
+                /* NOTE(review): lib_md_build() is written for a fresh MD.
+                 * Run on this MD it takes a reference on the new EQ without
+                 * dropping the old one, and calls lib_initialise_handle()
+                 * and list_add() again; since handle2md just resolved this
+                 * MD it is presumably still hashed and on ni_active_mds, so
+                 * this looks like a double insertion with a changed cookie
+                 * -- confirm.  If lib_md_build() fails, the MD may be left
+                 * with partly overwritten fields. */
+                ret->rc = lib_md_build(nal, md, private,
+                                       new, &new->eventq, md->unlink);
+
+                md->me = me;
+        } else {
+                ret->rc = PTL_NOUPDATE;
+        }
+
+ out:
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c
new file mode 100644 (file)
index 0000000..34fb606
--- /dev/null
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-me.c
+ * Match Entry management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me);
+
+/*
+ * PtlMEAttach handler: build a match entry from the request in 'v_args'
+ * (a PtlMEAttach_in) and link it onto the match list of portal 'index_in'.
+ *
+ * The outcome is stored in ret->rc ('v_ret' is a PtlMEAttach_out) and is
+ * also the return value:
+ *   PTL_INV_PTINDEX - index_in lies outside the portal table
+ *   PTL_NOSPACE     - lib_me_alloc() returned NULL
+ *   PTL_OK          - entry linked; ret->handle_out identifies it
+ *
+ * 'private' is not used.
+ */
+int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEAttach_in   *args = v_args;
+        PtlMEAttach_out  *ret = v_ret;
+        lib_ptl_t        *ptable = &nal->ni.tbl;
+        struct list_head *portal;
+        lib_me_t         *entry;
+        unsigned long     flags;
+
+        if (args->index_in < 0 || args->index_in >= ptable->size) {
+                ret->rc = PTL_INV_PTINDEX;
+                return ret->rc;
+        }
+
+        /* TODO: match_id_in is not validated yet */
+
+        entry = lib_me_alloc (nal);
+        if (entry == NULL) {
+                ret->rc = PTL_NOSPACE;
+                return ret->rc;
+        }
+
+        state_lock(nal, &flags);
+
+        /* a fresh entry has no memory descriptor attached */
+        entry->md = NULL;
+        entry->match_id = args->match_id_in;
+        entry->match_bits = args->match_bits_in;
+        entry->ignore_bits = args->ignore_bits_in;
+        entry->unlink = args->unlink_in;
+
+        lib_initialise_handle (nal, &entry->me_lh);
+
+        /* PTL_INS_AFTER appends to the portal's list, anything else
+         * puts the entry at the front */
+        portal = &(ptable->tbl[args->index_in]);
+        if (args->position_in != PTL_INS_AFTER)
+                list_add(&entry->me_list, portal);
+        else
+                list_add_tail(&entry->me_list, portal);
+
+        ptl_me2handle(&ret->handle_out, entry);
+
+        state_unlock(nal, &flags);
+
+        ret->rc = PTL_OK;
+        return ret->rc;
+}
+
+/*
+ * PtlMEInsert handler: create a match entry from the request in 'v_args'
+ * (a PtlMEInsert_in) and link it next to the existing entry named by
+ * 'current_in'.
+ *
+ * position_in == PTL_INS_AFTER places the new entry immediately after the
+ * current one in list order; any other value places it immediately before.
+ *
+ * The outcome is stored in ret->rc ('v_ret' is a PtlMEInsert_out) and is
+ * also the return value:
+ *   PTL_NOSPACE - lib_me_alloc() returned NULL
+ *   PTL_INV_ME  - current_in does not resolve to a match entry
+ *   PTL_OK      - entry linked; ret->handle_out identifies it
+ *
+ * 'private' is not used.
+ */
+int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEInsert_in *args = v_args;
+        PtlMEInsert_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+        lib_me_t *new;
+
+        /* allocate before taking the state lock, as do_PtlMEAttach does */
+        new = lib_me_alloc (nal);
+        if (new == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        /* Should check for valid matchid, but not yet */
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                lib_me_free (nal, new);
+
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_INV_ME);
+        }
+
+        new->match_id = args->match_id_in;
+        new->match_bits = args->match_bits_in;
+        new->ignore_bits = args->ignore_bits_in;
+        new->unlink = args->unlink_in;
+        new->md = NULL;
+
+        lib_initialise_handle (nal, &new->me_lh);
+
+        /* list_add() links 'new' directly after the given node and
+         * list_add_tail() links it directly before.  The reference node
+         * here is an entry rather than a list head, so "after" must use
+         * list_add() (the two calls were previously swapped). */
+        if (args->position_in == PTL_INS_AFTER)
+                list_add(&new->me_list, &me->me_list);
+        else
+                list_add_tail(&new->me_list, &me->me_list);
+
+        ptl_me2handle(&ret->handle_out, new);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+/*
+ * PtlMEUnlink handler: resolve 'current_in' from the PtlMEUnlink_in request
+ * and unlink that match entry via lib_me_unlink().
+ *
+ * Stores and returns PTL_INV_ME when the handle does not resolve,
+ * PTL_OK otherwise.  'private' is not used.
+ */
+int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEUnlink_in  *args = v_args;
+        PtlMEUnlink_out *ret = v_ret;
+        lib_me_t        *victim;
+        unsigned long    flags;
+
+        state_lock(nal, &flags);
+
+        victim = ptl_handle2me(&args->current_in, nal);
+        if (victim != NULL) {
+                /* lookup and unlink happen under the same lock hold */
+                lib_me_unlink(nal, victim);
+                ret->rc = PTL_OK;
+        } else {
+                ret->rc = PTL_INV_ME;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+/*
+ * Remove a match entry from its list, unlink any attached memory
+ * descriptor, invalidate the entry's handle and free it.
+ *
+ * The caller must hold the state lock.
+ */
+void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if ((ni->debug & PTL_DEBUG_UNLINK) != 0) {
+                /* the handle is computed but nothing is reported here */
+                ptl_handle_any_t handle;
+
+                ptl_me2handle(&handle, me);
+        }
+
+        list_del (&me->me_list);
+
+        if (me->md != NULL) {
+                /* break the MD's back pointer before unlinking the MD */
+                me->md->me = NULL;
+                lib_md_unlink(nal, me->md);
+        }
+
+        lib_invalidate_handle (nal, &me->me_lh);
+        lib_me_free(nal, me);
+}
+
+/*
+ * PtlTblDump handler: print every match entry linked on portal 'index_in'
+ * through nal->cb_printf.
+ *
+ * Stores and returns PTL_INV_PTINDEX when index_in is outside the portal
+ * table, PTL_OK otherwise.  'private' is not used.
+ */
+int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlTblDump_in    *args = v_args;
+        PtlTblDump_out   *ret = v_ret;
+        lib_ptl_t        *ptable = &nal->ni.tbl;
+        struct list_head *portal;
+        struct list_head *pos;
+        ptl_handle_any_t  handle;
+        unsigned long     flags;
+
+        if (args->index_in < 0 || args->index_in >= ptable->size) {
+                ret->rc = PTL_INV_PTINDEX;
+                return ret->rc;
+        }
+
+        /* the banner is printed before the state lock is taken */
+        nal->cb_printf(nal, "Portal table index %d\n", args->index_in);
+
+        state_lock(nal, &flags);
+        portal = &(ptable->tbl[args->index_in]);
+        for (pos = portal->next; pos != portal; pos = pos->next) {
+                lib_me_t *me = list_entry(pos, lib_me_t, me_list);
+
+                /* handle is computed but not printed */
+                ptl_me2handle(&handle, me);
+                lib_me_dump(nal, me);
+        }
+        state_unlock(nal, &flags);
+
+        ret->rc = PTL_OK;
+        return ret->rc;
+}
+
+/*
+ * PtlMEDump handler: resolve 'current_in' from the PtlMEDump_in request and
+ * print that match entry with lib_me_dump().
+ *
+ * Stores and returns PTL_INV_ME when the handle does not resolve,
+ * PTL_OK otherwise.  'private' is not used.
+ */
+int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEDump_in  *args = v_args;
+        PtlMEDump_out *ret = v_ret;
+        lib_me_t      *entry;
+        unsigned long  flags;
+
+        state_lock(nal, &flags);
+
+        entry = ptl_handle2me(&args->current_in, nal);
+        if (entry != NULL) {
+                lib_me_dump(nal, entry);
+                ret->rc = PTL_OK;
+        } else {
+                ret->rc = PTL_INV_ME;
+        }
+
+        state_unlock(nal, &flags);
+
+        return ret->rc;
+}
+
+/*
+ * Print one match entry through nal->cb_printf: its address and handle
+ * cookie, its match/ignore bits, its attached MD pointer and its list
+ * neighbours.
+ *
+ * Both callers in this file invoke it with the state lock held.
+ */
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me)
+{
+        nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, 
+                       me->me_lh.lh_cookie);
+
+        /* Match bits are handled as 64-bit values elsewhere in this code
+         * (printed with LPX64, byte-swapped with NTOH__u64); "%lx" reads
+         * only an unsigned long and misparses the argument list where
+         * long is 32 bits, so use the 64-bit format here as well. */
+        nal->cb_printf(nal, "\tMatch/Ignore\t= "LPX64" / "LPX64"\n",
+                       me->match_bits, me->ignore_bits);
+
+        nal->cb_printf(nal, "\tMD\t= %p\n", me->md);
+
+        /* NOTE(review): prev/next are converted as if they were match
+         * entries; at either end of a portal list the neighbour is
+         * presumably the list head, so that pointer is not a real ME. */
+        nal->cb_printf(nal, "\tprev\t= %p\n",
+                       list_entry(me->me_list.prev, lib_me_t, me_list));
+        nal->cb_printf(nal, "\tnext\t= %p\n",
+                       list_entry(me->me_list.next, lib_me_t, me_list));
+}
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
new file mode 100644 (file)
index 0000000..7ba1664
--- /dev/null
@@ -0,0 +1,1287 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-move.c
+ * Data movement routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Right now it does not check access control lists.
+ *
+ * We only support one MD per ME, which is how the Portals 3.1 spec is written.
+ * All previous complication is removed.
+ */
+
+/*
+ * Find the first match entry on portal 'index' whose attached MD accepts
+ * an incoming operation.
+ *
+ * An entry is skipped when it has no MD, the MD's threshold is 0, the MD
+ * options do not include 'op_mask', the entry's match_id does not accept
+ * src_nid/src_pid, or the match bits differ outside the ignore bits.
+ *
+ * For the first entry that passes, the local offset is md->offset unless
+ * PTL_MD_MANAGE_REMOTE is set, in which case the sender's 'roffset' is
+ * used.  The accepted length is 'rlength' when it fits in the remaining
+ * space (optionally capped by max_size), or the available space when
+ * PTL_MD_TRUNCATE is set; otherwise the request is dropped.
+ *
+ * On success md->offset is advanced, *offset_out / *mlength_out /
+ * *unlink_out are filled in and the entry is returned.  On any failure an
+ * error is logged and NULL is returned with the out parameters untouched.
+ *
+ * parse_put() and parse_get() call this with the state lock held.
+ */
+static lib_me_t *
+lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
+            ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset,
+            ptl_match_bits_t match_bits, ptl_size_t *mlength_out,
+            ptl_size_t *offset_out, int *unlink_out)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *match_list;
+        struct list_head *tmp;
+        lib_me_t         *me;
+        lib_md_t         *md;
+        ptl_size_t        mlength;
+        ptl_size_t        offset;
+
+        ENTRY;
+
+        CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
+                "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
+
+        if (index < 0 || index >= ni->tbl.size) {
+                CERROR("Invalid portal %d not in [0-%d]\n",
+                       index, ni->tbl.size);
+                goto failed;
+        }
+
+        /* only index the portal table once 'index' is known to be valid */
+        match_list = &ni->tbl.tbl[index];
+
+        list_for_each (tmp, match_list) {
+                me = list_entry(tmp, lib_me_t, me_list);
+                md = me->md;
+
+                 /* ME attached but MD not attached yet */
+                if (md == NULL)
+                        continue;
+
+                LASSERT (me == md->me);
+
+                /* MD deactivated */
+                if (md->threshold == 0)
+                        continue;
+
+                /* mismatched MD op */
+                if ((md->options & op_mask) == 0)
+                        continue;
+
+                /* mismatched ME nid/pid? */
+                if (me->match_id.nid != PTL_NID_ANY &&
+                    me->match_id.nid != src_nid)
+                        continue;
+
+                if (me->match_id.pid != PTL_PID_ANY &&
+                    me->match_id.pid != src_pid)
+                        continue;
+
+                /* mismatched ME matchbits? */
+                if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0)
+                        continue;
+
+                /* Hurrah! This _is_ a match; check it out... */
+
+                if ((md->options & PTL_MD_MANAGE_REMOTE) == 0)
+                        offset = md->offset;
+                else
+                        offset = roffset;
+
+                /* A remotely managed offset comes straight off the wire.
+                 * Reject one that lies beyond the MD, otherwise the
+                 * subtraction below yields a bogus amount of space and the
+                 * bad offset is only caught by assertions further down. */
+                if (offset > md->length) {
+                        CERROR("Matching packet offset %d beyond MD "
+                               "length %d\n", offset, md->length);
+                        goto failed;
+                }
+
+                mlength = md->length - offset;
+                if ((md->options & PTL_MD_MAX_SIZE) != 0 &&
+                    mlength > md->max_size)
+                        mlength = md->max_size;
+
+                if (rlength <= mlength) {        /* fits in allowed space */
+                        mlength = rlength;
+                } else if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        /* this packet _really_ is too big */
+                        CERROR("Matching packet %d too big: %d left, "
+                               "%d allowed\n", rlength, md->length - offset,
+                               mlength);
+                        goto failed;
+                }
+
+                /* consume the space so the next match starts after it */
+                md->offset = offset + mlength;
+
+                *offset_out = offset;
+                *mlength_out = mlength;
+                *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 &&
+                               md->offset >= (md->length - md->max_size));
+                RETURN (me);
+        }
+
+ failed:
+        CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
+                " offset %d length %d: no match\n",
+                ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
+                src_nid, src_pid, index, match_bits, roffset, rlength);
+        RETURN(NULL);
+}
+
+/*
+ * PtlFailNid handler: maintain the list of test peers consulted by
+ * fail_peer().
+ *
+ * A non-zero args->threshold adds an entry for args->nid with that
+ * threshold.  A zero threshold removes entries: every entry whose own
+ * threshold has reached 0, every entry when args->nid is PTL_NID_ANY, and
+ * otherwise the entries whose nid equals args->nid.
+ *
+ * Stores and returns PTL_FAIL when an entry cannot be allocated,
+ * PTL_OK otherwise.  'private' is not used.
+ */
+int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+{
+        PtlFailNid_in     *args = v_args;
+        PtlFailNid_out    *ret  = v_ret;
+        struct list_head  *peers = &nal->ni.ni_test_peers;
+        struct list_head   cull;
+        struct list_head  *el;
+        struct list_head  *next;
+        lib_test_peer_t   *tp;
+        unsigned long      flags;
+
+        if (args->threshold != 0) {
+                /* Adding a new entry */
+                tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp));
+                if (tp == NULL) {
+                        ret->rc = PTL_FAIL;
+                        return (ret->rc);
+                }
+
+                tp->tp_nid = args->nid;
+                tp->tp_threshold = args->threshold;
+
+                state_lock (nal, &flags);
+                list_add (&tp->tp_list, peers);
+                state_unlock (nal, &flags);
+
+                ret->rc = PTL_OK;
+                return (ret->rc);
+        }
+
+        /* Removing entries: collect them on a private list under the lock
+         * and free them only after the lock has been dropped */
+        INIT_LIST_HEAD (&cull);
+
+        state_lock (nal, &flags);
+
+        for (el = peers->next; el != peers; el = next) {
+                /* remember the successor: 'el' may be moved to 'cull' */
+                next = el->next;
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+
+                if (tp->tp_threshold != 0 &&    /* not a zombie */
+                    args->nid != PTL_NID_ANY && /* not removing everything */
+                    tp->tp_nid != args->nid)    /* not the one asked for */
+                        continue;
+
+                list_del (&tp->tp_list);
+                list_add (&tp->tp_list, &cull);
+        }
+
+        state_unlock (nal, &flags);
+
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+
+                list_del (&tp->tp_list);
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+
+        ret->rc = PTL_OK;
+        return (ret->rc);
+}
+
+/*
+ * Decide whether traffic with 'nid' should be failed according to the
+ * test-peer list.
+ *
+ * Returns 1 when a live entry (threshold != 0) names 'nid' or
+ * PTL_NID_ANY, 0 otherwise.  A matching entry with a finite threshold has
+ * it decremented.  Entries whose threshold is 0 are removed and freed
+ * only when 'outgoing' is non-zero.
+ */
+static int
+fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) 
+{
+        struct list_head  cull;
+        struct list_head *el;
+        struct list_head *next;
+        lib_test_peer_t  *tp;
+        unsigned long     flags;
+        int               fail = 0;
+
+        INIT_LIST_HEAD (&cull);
+
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+
+                if (tp->tp_threshold == 0) {
+                        /* zombie entry: only cull zombies on outgoing
+                         * tests, since we may be at interrupt priority on
+                         * incoming messages. */
+                        if (outgoing) {
+                                list_del (&tp->tp_list);
+                                list_add (&tp->tp_list, &cull);
+                        }
+                        continue;
+                }
+
+                /* neither a wildcard entry nor one for this peer */
+                if (tp->tp_nid != PTL_NID_ANY && tp->tp_nid != nid)
+                        continue;
+
+                fail = 1;
+
+                if (tp->tp_threshold != PTL_MD_THRESH_INF) {
+                        tp->tp_threshold--;
+                        if (outgoing && tp->tp_threshold == 0) {
+                                /* just expired: cull it now (see above) */
+                                list_del (&tp->tp_list);
+                                list_add (&tp->tp_list, &cull);
+                        }
+                }
+
+                /* the first live match decides; stop scanning */
+                break;
+        }
+
+        state_unlock (nal, &flags);
+
+        /* free culled entries after the lock has been dropped */
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                list_del (&tp->tp_list);
+
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+
+        return (fail);
+}
+
+/*
+ * Return the sum of iov_len over the first 'niov' entries of 'iov'
+ * (0 when niov <= 0).
+ */
+ptl_size_t
+lib_iov_nob (int niov, struct iovec *iov)
+{
+        ptl_size_t total = 0;
+        int        i;
+
+        for (i = 0; i < niov; i++)
+                total += iov[i].iov_len;
+
+        return (total);
+}
+
+/*
+ * Gather 'len' bytes from the fragments of 'iov' into the flat buffer
+ * 'dest', taking fragments in order from the start of each one.
+ *
+ * Asserts that a fragment is still available whenever bytes remain.
+ */
+void
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+{
+        ptl_size_t chunk;
+
+        for (; len > 0; iov++, niov--) {
+                LASSERT (niov > 0);
+
+                /* never take more than this fragment holds */
+                chunk = MIN (iov->iov_len, len);
+                memcpy (dest, iov->iov_base, chunk);
+
+                dest += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Scatter 'len' bytes from the flat buffer 'src' into the fragments of
+ * 'iov', filling fragments in order from the start of each one.
+ *
+ * Asserts that a fragment is still available whenever bytes remain.
+ */
+void
+lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+{
+        ptl_size_t chunk;
+
+        for (; len > 0; iov++, niov--) {
+                LASSERT (niov > 0);
+
+                /* never write more than this fragment holds */
+                chunk = MIN (iov->iov_len, len);
+                memcpy (iov->iov_base, src, chunk);
+
+                src += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Fill 'dst' with the fragments describing the 'len' bytes that start
+ * 'offset' bytes into the MD's iovec (md->md_iov.iov, md->md_niov
+ * entries) and return how many 'dst' entries were written.
+ *
+ * The MD's own iovec is not modified.  Returns 0 when len is 0.
+ * Asserts that offset + len fits in md->length, that the source
+ * fragments are not exhausted and that no more than PTL_MD_MAX_IOV
+ * entries are produced.
+ */
+static int
+lib_extract_iov (struct iovec *dst, lib_md_t *md,
+                 ptl_size_t offset, ptl_size_t len)
+{
+        struct iovec   *src = md->md_iov.iov;
+        int             src_niov = md->md_niov;
+        int             dst_niov;
+        ptl_size_t      frag_len;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+
+        if (len == 0)                           /* no data => no frags */
+                return (0);
+
+        /* skip the source fragments that lie wholly before 'offset' */
+        for (;;) {
+                LASSERT (src_niov > 0);
+                if (offset < src->iov_len)
+                        break;
+
+                offset -= src->iov_len;
+                src_niov--;
+                src++;
+        }
+
+        /* emit one destination fragment per source fragment; only the
+         * first one starts at a non-zero offset */
+        for (dst_niov = 1; ; dst_niov++, dst++, src++, src_niov--, offset = 0) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+
+                frag_len = src->iov_len - offset;
+                dst->iov_base = ((char *)src->iov_base) + offset;
+
+                if (len <= frag_len)            /* the rest fits here */
+                        break;
+
+                dst->iov_len = frag_len;
+                len -= frag_len;
+        }
+
+        dst->iov_len = len;
+        return (dst_niov);
+}
+
+#ifndef __KERNEL__
+/* Stub used when __KERNEL__ is not defined: trips LASSERT (0) if it is
+ * ever called, and returns 0 should the assertion not stop execution. */
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        LASSERT (0);
+        return (0);
+}
+
+/* Stub used when __KERNEL__ is not defined: trips LASSERT (0) if it is
+ * ever called and copies nothing. */
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+/* Stub used when __KERNEL__ is not defined: trips LASSERT (0) if it is
+ * ever called and copies nothing.  The buffer argument is the copy source
+ * and is named 'src' to agree with the kernel implementation of this
+ * function. */
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+/* Stub used when __KERNEL__ is not defined: trips LASSERT (0) if it is
+ * ever called.  It reports 0 fragments so that this non-void function
+ * never falls off its end, matching the lib_kiov_nob stub. */
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        LASSERT (0);
+        return (0);
+}
+
+#else
+
+/*
+ * Return the sum of kiov_len over the first 'niov' entries of 'kiov'
+ * (0 when niov <= 0).
+ */
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        ptl_size_t  total = 0;
+        int         i;
+
+        for (i = 0; i < niov; i++)
+                total += kiov[i].kiov_len;
+
+        return (total);
+}
+
+/*
+ * Gather 'len' bytes from the page fragments of 'kiov' into the flat
+ * buffer 'dest'.  Each page is mapped with kmap() for the duration of its
+ * copy and unmapped again afterwards.
+ *
+ * Asserts that it is not called in interrupt context and that a fragment
+ * is still available whenever bytes remain.
+ */
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        ptl_size_t  chunk;
+        char       *page_va;
+
+        LASSERT (!in_interrupt ());
+
+        for (; len > 0; kiov++, niov--) {
+                LASSERT (niov > 0);
+                chunk = MIN (kiov->kiov_len, len);
+
+                page_va = (char *)kmap (kiov->kiov_page);
+                memcpy (dest, page_va + kiov->kiov_offset, chunk);
+                kunmap (kiov->kiov_page);
+
+                dest += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Scatter 'len' bytes from the flat buffer 'src' into the page fragments
+ * of 'kiov'.  Each page is mapped with kmap() for the duration of its
+ * copy and unmapped again afterwards.
+ *
+ * Asserts that it is not called in interrupt context and that a fragment
+ * is still available whenever bytes remain.
+ */
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+{
+        ptl_size_t  chunk;
+        char       *page_va;
+
+        LASSERT (!in_interrupt ());
+
+        for (; len > 0; kiov++, niov--) {
+                LASSERT (niov > 0);
+                chunk = MIN (kiov->kiov_len, len);
+
+                page_va = (char *)kmap (kiov->kiov_page);
+                memcpy (page_va + kiov->kiov_offset, src, chunk);
+                kunmap (kiov->kiov_page);
+
+                src += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Fill 'dst' with the page fragments describing the 'len' bytes that
+ * start 'offset' bytes into the MD's kiov (md->md_iov.kiov, md->md_niov
+ * entries) and return how many 'dst' entries were written.
+ *
+ * The MD's own kiov is not modified.  Returns 0 when len is 0.
+ * Asserts that offset + len fits in md->length, that the source
+ * fragments are not exhausted, that no more than PTL_MD_MAX_IOV entries
+ * are produced and that every emitted fragment ends within PAGE_SIZE.
+ */
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        ptl_kiov_t     *src = md->md_iov.kiov;
+        int             src_niov = md->md_niov;
+        int             dst_niov;
+        ptl_size_t      frag_len;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+
+        if (len == 0)                           /* no data => no frags */
+                return (0);
+
+        /* skip the source fragments that lie wholly before 'offset' */
+        for (;;) {
+                LASSERT (src_niov > 0);
+                if (offset < src->kiov_len)
+                        break;
+
+                offset -= src->kiov_len;
+                src_niov--;
+                src++;
+        }
+
+        /* emit one destination fragment per source fragment; only the
+         * first one starts at a non-zero offset within its fragment */
+        for (dst_niov = 1; ; dst_niov++, dst++, src++, src_niov--, offset = 0) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+
+                frag_len = src->kiov_len - offset;
+                dst->kiov_page = src->kiov_page;
+                dst->kiov_offset = src->kiov_offset + offset;
+
+                if (len <= frag_len)            /* the rest fits here */
+                        break;
+
+                dst->kiov_len = frag_len;
+                LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+
+                len -= frag_len;
+        }
+
+        dst->kiov_len = len;
+        LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+        return (dst_niov);
+}
+#endif
+
+/*
+ * Hand an incoming message to the NAL's receive callback.
+ *
+ * When mlen is 0 the callback is invoked with no fragments (this is also
+ * how callers in this file discard a message, passing NULL for msg and
+ * md).  Otherwise the 'mlen' bytes at 'offset' in the MD are described in
+ * the message's own fragment array and passed to cb_recv, or to
+ * cb_recv_pages when the MD has PTL_MD_KIOV set.  'rlen' is forwarded
+ * unchanged.
+ */
+void
+lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+          ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+{
+        int   nfrags;
+
+        if (mlen == 0) {
+                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
+                return;
+        }
+
+        if ((md->options & PTL_MD_KIOV) != 0) {
+                /* page-based MD */
+                nfrags = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
+                nal->cb_recv_pages (nal, private, msg,
+                                    nfrags, msg->msg_iov.kiov, mlen, rlen);
+                return;
+        }
+
+        /* virtual-address MD */
+        nfrags = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
+        nal->cb_recv (nal, private, msg,
+                      nfrags, msg->msg_iov.iov, mlen, rlen);
+}
+
+/*
+ * Hand an outgoing message to the NAL's send callback and return the
+ * callback's result.
+ *
+ * When len is 0 the header is sent with no payload fragments.  Otherwise
+ * the 'len' bytes at 'offset' in the MD are described in the message's
+ * own fragment array and passed to cb_send, or to cb_send_pages when the
+ * MD has PTL_MD_KIOV set.
+ */
+int
+lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+          ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+          lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
+{
+        int   nfrags;
+        int   rc;
+
+        if (len == 0) {
+                /* header only */
+                rc = nal->cb_send (nal, private, msg,
+                                   hdr, type, nid, pid,
+                                   0, NULL, 0);
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+                /* page-based MD */
+                nfrags = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
+                rc = nal->cb_send_pages (nal, private, msg,
+                                         hdr, type, nid, pid,
+                                         nfrags, msg->msg_iov.kiov, len);
+        } else {
+                /* virtual-address MD */
+                nfrags = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
+                rc = nal->cb_send (nal, private, msg,
+                                   hdr, type, nid, pid,
+                                   nfrags, msg->msg_iov.iov, len);
+        }
+
+        return (rc);
+}
+
+/*
+ * Allocate a zeroed message bound to 'md' and account for it.
+ *
+ * On success the MD's pending count is incremented, a finite threshold
+ * is decremented, the allocation counters are updated and the message is
+ * linked onto the active message list.  Returns NULL, changing nothing,
+ * when lib_msg_alloc() fails.
+ *
+ * ALWAYS called holding the state_lock.
+ */
+static lib_msg_t *
+get_new_msg (nal_cb_t *nal, lib_md_t *md)
+{
+        lib_counters_t *stats = &nal->ni.counters;
+        lib_msg_t      *msg;
+
+        msg = lib_msg_alloc (nal);
+        if (msg == NULL)
+                return (NULL);
+
+        memset (msg, 0, sizeof (*msg));
+        msg->send_ack = 0;
+        msg->md = md;
+        msg->ev.arrival_time = get_cycles();
+
+        /* the message is now outstanding against the MD and uses up one
+         * unit of a finite threshold */
+        md->pending++;
+        if (md->threshold != PTL_MD_THRESH_INF) {
+                LASSERT (md->threshold > 0);
+                md->threshold--;
+        }
+
+        /* track current and high-water message allocation */
+        stats->msgs_alloc++;
+        if (stats->msgs_max < stats->msgs_alloc)
+                stats->msgs_max = stats->msgs_alloc;
+
+        list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+
+        return (msg);
+}
+
+
+/*
+ * Incoming messages have a ptl_msg_t object associated with them
+ * by the library.  This object encapsulates the state of the
+ * message and allows the NAL to do non-blocking receives or sends
+ * of long messages.
+ *
+ * parse_put() handles an incoming PUT header: it converts the put fields
+ * to host byte order, looks for a matching ME/MD with lib_find_me(),
+ * allocates a message for it and starts the receive with lib_recv().
+ *
+ * Returns 0 once the receive has been started.  Returns -1 when nothing
+ * matched or no message could be allocated; in that case the drop
+ * counters are bumped and lib_recv() is called with no message and zero
+ * match length.
+ */
+static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* Convert put fields to host byte order */
+        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);
+
+        state_lock(nal, &flags);
+
+        /* on success this has already advanced the matched MD's offset */
+        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
+                         hdr->src_nid, hdr->src_pid,
+                         PTL_HDR_LENGTH (hdr), hdr->msg.put.offset,
+                         hdr->msg.put.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
+               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                /* NOTE(review): md->offset was already advanced by
+                 * lib_find_me() and is not rolled back on this path --
+                 * confirm that is intended */
+                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* record what is needed to send an ACK, but only if the sender
+         * supplied an ack MD handle and this MD has not disabled ACKs */
+        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+            !(md->options & PTL_MD_ACK_DISABLE)) {
+                msg->send_ack = 1;
+                msg->ack_wmd = hdr->msg.put.ack_wmd;
+                msg->nid = hdr->src_nid;
+                msg->pid = hdr->src_pid;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+        }
+
+        /* the event is only filled in when the MD has an event queue */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_PUT;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.put.ptl_index;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += mlength;
+
+        /* only unlink after MD's pending count has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        /* the receive itself is started after the state lock is dropped */
+        lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        /* reached with the state lock held */
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        /* no message, no MD, zero match length */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming GET header: convert the get fields to host byte
+ * order, look for a matching ME/MD with lib_find_me(), allocate a message
+ * for it and send the matched data back in a REPLY with lib_send().
+ *
+ * Returns the lib_send() result (0) once the reply has been sent and the
+ * incoming message completed.  Returns -1 when nothing matched, no
+ * message could be allocated or the send failed; in that case the drop
+ * counters are bumped and lib_recv() is called with no message and zero
+ * match length.
+ */
+static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        ptl_hdr_t        reply;
+        unsigned long    flags;
+        int              rc;
+
+        /* Convert get fields to host byte order */
+        hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits);
+        hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index);
+        hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length);
+        hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset);
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.get.return_offset != 0)
+                CERROR("Unexpected non-zero get.return_offset %x from "
+                       LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        /* the requested length is the sender's sink_length; on success
+         * this has already advanced the matched MD's offset */
+        me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
+                         hdr->src_nid, hdr->src_pid,
+                         hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                         hdr->msg.get.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
+               "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* the event is only filled in when the MD has an event queue */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_GET;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.get.ptl_index;
+                msg->ev.match_bits = hdr->msg.get.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = 0;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        /* a GET is served by sending data, so it counts as a send */
+        ni->counters.send_count++;
+        ni->counters.send_length += mlength;
+
+        /* only unlink after MD's refcount has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        /* build the REPLY header in network byte order, addressed back to
+         * the requester */
+        memset (&reply, 0, sizeof (reply));
+        reply.type     = HTON__u32 (PTL_MSG_REPLY);
+        reply.dest_nid = HTON__u64 (hdr->src_nid);
+        reply.src_nid  = HTON__u64 (ni->nid);
+        reply.dest_pid = HTON__u32 (hdr->src_pid);
+        reply.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength);
+
+        /* copied as received; no byte-order conversion is applied */
+        reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
+
+        rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
+                       hdr->src_nid, hdr->src_pid, md, offset, mlength);
+        if (rc != 0) {
+                /* NOTE(review): the message from get_new_msg() and its
+                 * md->pending bump are not released on this path, and the
+                 * send counters above are not undone -- confirm whether
+                 * the NAL or a finalize step is expected to clean up */
+                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
+                       ni->nid, hdr->src_nid);
+                /* the drop path expects the state lock to be held */
+                state_lock (nal, &flags);
+                goto drop;
+        }
+
+        /* Complete the incoming message */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return (rc);
+ drop:
+        /* reached with the state lock held */
+        ni->counters.drop_count++;
+        ni->counters.drop_length += hdr->msg.get.sink_length;
+        state_unlock(nal, &flags);
+        /* no message, no MD, zero match length */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming REPLY message.
+ *
+ * The destination MD is looked up from the wire handle carried in the
+ * header.  The message is dropped when that MD cannot be found, is
+ * inactive (threshold == 0), is too small for the payload and does not
+ * permit truncation, or when no message descriptor can be allocated.
+ * On a drop the drop counters are bumped and lib_recv() is called
+ * without an MD.  Otherwise a PTL_EVENT_REPLY event is prepared (only if
+ * the MD has an event queue) and lib_recv() is asked to deliver the
+ * payload into the MD.
+ *
+ * Returns 0 if the reply was accepted, -1 if it was dropped.
+ */
+static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        int              rlength;       /* length requested by the header */
+        int              mlength;       /* length actually delivered */
+        unsigned long    flags;
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.reply.dst_offset != 0) {
+                CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n",
+                       hdr->msg.reply.dst_offset, hdr->src_nid);
+        }
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
+                        ni->nid, hdr->src_nid,
+                        md == NULL ? "invalid" : "inactive",
+                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                        hdr->msg.reply.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        LASSERT (md->offset == 0);
+
+        /* Start by assuming the whole payload fits, then clip it to the
+         * MD if the MD allows truncation. */
+        rlength = PTL_HDR_LENGTH(hdr);
+        mlength = rlength;
+
+        if (mlength > md->length) {
+                if (!(md->options & PTL_MD_TRUNCATE)) {
+                        CERROR (LPU64": Dropping REPLY from "LPU64
+                                " length %d for MD "LPX64" would overflow (%d)\n",
+                                ni->nid, hdr->src_nid, mlength,
+                                hdr->msg.reply.dst_wmd.wh_object_cookie,
+                                md->length);
+                        goto drop;
+                }
+                mlength = md->length;
+        }
+
+        CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n",
+               hdr->src_nid, mlength, rlength,
+               hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping REPLY from "LPU64": can't "
+                       "allocate msg\n", ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* Only fill in the event when somebody can receive it */
+        if (md->eq) {
+                msg->ev.type          = PTL_EVENT_REPLY;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.rlength       = rlength;
+                msg->ev.mlength       = mlength;
+                msg->ev.offset        = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += mlength;
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, 0, mlength, rlength);
+        return 0;
+
+ drop:
+        /* Reached with the state lock held */
+        ni->counters.drop_count++;
+        ni->counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming ACK message.
+ *
+ * The ACK-specific header fields are converted to host byte order, then
+ * the MD named by the wire handle is looked up.  If the MD is missing or
+ * inactive (threshold == 0), or no message descriptor can be allocated,
+ * the ACK is counted as dropped.  Otherwise a PTL_EVENT_ACK event is
+ * prepared (only if the MD has an event queue).  In both cases
+ * lib_recv() is called with no MD and zero lengths.
+ *
+ * Returns 0 if the ACK was accepted, -1 if it was dropped.
+ */
+static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t      *ni = &nal->ni;
+        lib_md_t      *md;
+        lib_msg_t     *msg = NULL;
+        unsigned long  flags;
+
+        /* Convert ack fields to host byte order */
+        hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength);
+        hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CERROR(LPU64": Dropping ACK from "LPU64" to %s MD "
+                       LPX64"."LPX64"\n", ni->nid, hdr->src_nid,
+                       (md == NULL) ? "invalid" : "inactive",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
+               ni->nid, hdr->src_nid,
+               hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* Only fill in the event when somebody can receive it */
+        if (md->eq) {
+                msg->ev.type          = PTL_EVENT_ACK;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.mlength       = hdr->msg.ack.mlength;
+                msg->ev.match_bits    = hdr->msg.ack.match_bits;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        /* Reached with the state lock held */
+        ni->counters.drop_count++;
+        state_unlock (nal, &flags);
+
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Map the (host byte order) message type in 'hdr' to a printable name.
+ * Types that are not recognised yield "<UNKNOWN>".  The returned pointer
+ * refers to a string literal and must not be modified or freed.
+ */
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        char *name;
+
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                name = "ACK";
+                break;
+        case PTL_MSG_PUT:
+                name = "PUT";
+                break;
+        case PTL_MSG_GET:
+                name = "GET";
+                break;
+        case PTL_MSG_REPLY:
+                name = "REPLY";
+                break;
+        case PTL_MSG_HELLO:
+                name = "HELLO";
+                break;
+        default:
+                name = "<UNKNOWN>";
+                break;
+        }
+
+        return (name);
+}
+
+/*
+ * Dump a Portals header through the NAL's cb_printf callback.
+ *
+ * The common fields (type, source and destination nid/pid) are always
+ * printed; PUT, GET, ACK and REPLY headers additionally get their
+ * type-specific fields.  Other types print only the common part.
+ *
+ * The pids are 32-bit quantities (lib_parse() converts them with
+ * NTOH__u32), so they are printed with %u; the 64-bit nids use LPU64.
+ * Passing a 32-bit value for a 64-bit conversion would misread the
+ * variable argument list.
+ */
+void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str);
+        /* NB no newline here: "From" and "To" share one output line */
+        nal->cb_printf(nal, "    From nid/pid "LPU64"/%u", hdr->src_nid,
+                       hdr->src_pid);
+        nal->cb_printf(nal, "    To nid/pid "LPU64"/%u\n", hdr->dest_nid,
+                       hdr->dest_pid);
+
+        switch (hdr->type) {
+        case PTL_MSG_PUT:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, ack md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n",
+                               hdr->msg.put.ptl_index,
+                               hdr->msg.put.ack_wmd.wh_interface_cookie,
+                               hdr->msg.put.ack_wmd.wh_object_cookie,
+                               hdr->msg.put.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, offset %d, hdr data "LPX64"\n",
+                               PTL_HDR_LENGTH(hdr), hdr->msg.put.offset,
+                               hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, return md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                               hdr->msg.get.return_wmd.wh_interface_cookie,
+                               hdr->msg.get.return_wmd.wh_object_cookie,
+                               hdr->msg.get.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, src offset %d\n",
+                               hdr->msg.get.sink_length,
+                               hdr->msg.get.src_offset);
+                break;
+
+        case PTL_MSG_ACK:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "manipulated length %d\n",
+                               hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                               hdr->msg.ack.dst_wmd.wh_object_cookie,
+                               hdr->msg.ack.mlength);
+                break;
+
+        case PTL_MSG_REPLY:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "length %d\n",
+                               hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                               hdr->msg.reply.dst_wmd.wh_object_cookie,
+                               PTL_HDR_LENGTH(hdr));
+                break;
+
+        default:
+                /* nothing type-specific to print */
+                break;
+        }
+
+}                               /* end of print_hdr() */
+
+
+int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        unsigned long  flags;
+
+        /* NB static check; optimizer will elide this if it's right */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.put.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.get.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.reply.length));
+
+        /* convert common fields to host byte order */
+        hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
+        hdr->src_nid = NTOH__u64 (hdr->src_nid);
+        hdr->dest_pid = NTOH__u32 (hdr->dest_pid);
+        hdr->src_pid = NTOH__u32 (hdr->src_pid);
+        hdr->type = NTOH__u32 (hdr->type);
+        PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr));
+#if 0
+        nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n",
+                       nal->ni.nid, nal, hdr, hdr->type);
+        print_hdr(nal, hdr);
+#endif
+        if (hdr->type == PTL_MSG_HELLO) {
+                /* dest_nid is really ptl_magicversion_t */
+                ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
+
+                CERROR (LPU64": Dropping unexpected HELLO message: "
+                        "magic %d, version %d.%d from "LPD64"\n",
+                        nal->ni.nid, mv->magic, 
+                        mv->version_major, mv->version_minor,
+                        hdr->src_nid);
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+        
+        if (hdr->dest_nid != nal->ni.nid) {
+                CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
+                       " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
+                       hdr->src_nid, hdr->dest_nid);
+
+                state_lock (nal, &flags);
+                nal->ni.counters.drop_count++;
+                nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+                state_unlock (nal, &flags);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
+        {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": simulated failure\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                return (-1);
+        }
+        
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return (parse_ack(nal, hdr, private));
+        case PTL_MSG_PUT:
+                return (parse_put(nal, hdr, private));
+                break;
+        case PTL_MSG_GET:
+                return (parse_get(nal, hdr, private));
+                break;
+        case PTL_MSG_REPLY:
+                return (parse_reply(nal, hdr, private));
+                break;
+        default:
+                CERROR(LPU64": Dropping <unknown> message from "LPU64
+                       ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
+                       hdr->type);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+}
+
+
+/*
+ * Library-side implementation of PtlPut(): send the whole of an MD to a
+ * remote process.
+ *
+ * Builds a PUT header in wire byte order, allocates a message descriptor
+ * bound to the MD, prepares a PTL_EVENT_SENT event if the MD has an
+ * event queue, and hands the message to lib_send().
+ *
+ * Sets ret->rc and returns the same value:
+ *   PTL_INV_PROC  the target is currently a simulated failed peer
+ *   PTL_INV_MD    the MD handle is invalid or the MD is inactive
+ *   PTL_NOSPACE   no message descriptor could be allocated
+ *   PTL_OK        the message was handed to lib_send()
+ *
+ * NB the result of lib_send() itself is not reported to the caller.
+ */
+int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_ack_req_t ack_req_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlPut_in *args = v_args;
+        PtlPut_out *ret = v_ret;
+        ptl_hdr_t hdr;
+
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        ptl_process_id_t *id = &args->target_in;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+
+        ret->rc = PTL_OK;
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_PUT);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length);
+
+        /* NB handles only looked up by creator (no flips) */
+        if (args->ack_req_in == PTL_ACK_REQ) {
+                hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+                hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        } else {
+                hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+
+        hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.put.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.put.offset = HTON__u32 (args->offset_in);
+        hdr.msg.put.hdr_data = args->hdr_data_in;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("BAD: could not allocate msg!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /* Only count the send once the message descriptor exists, so a
+         * PUT that fails with PTL_NOSPACE is not accounted as sent. */
+        ni->counters.send_count++;
+        ni->counters.send_length += md->length;
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it we need to allocate a message state object and record the
+         * information about this operation that will be recorded into
+         * event queue once the message has been completed.
+         *
+         * NB. We're now committed to the PUT, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle.
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = args->hdr_data_in;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
+                  id->nid, id->pid, md, 0, md->length);
+
+        return ret->rc = PTL_OK;
+}
+
+
+/*
+ * Library-side implementation of PtlGet(): ask a remote process to send
+ * data back into a local MD.
+ *
+ * Builds a GET header in wire byte order (carrying the local MD's wire
+ * handle as the return MD and its length as the sink length), allocates
+ * a message descriptor bound to the MD, prepares a PTL_EVENT_SENT event
+ * if the MD has an event queue, and hands the header-only message to
+ * lib_send().
+ *
+ * Sets ret->rc and returns the same value:
+ *   PTL_INV_PROC  the target is currently a simulated failed peer
+ *   PTL_INV_MD    the MD handle is invalid or the MD is inactive
+ *   PTL_NOSPACE   no message descriptor could be allocated
+ *   PTL_OK        the request was handed to lib_send()
+ *
+ * NB the result of lib_send() itself is not reported to the caller.
+ */
+int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlGet_in *args = v_args;
+        PtlGet_out *ret = v_ret;
+        ptl_hdr_t hdr;
+        lib_msg_t *msg = NULL;
+        lib_ni_t *ni = &nal->ni;
+        ptl_process_id_t *id = &args->target_in;
+        lib_md_t *md;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                /* This is the GET path; the message used to say PUT */
+                CERROR(LPU64": Dropping GET to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        LASSERT (md->offset == 0);
+
+        CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_GET);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = 0;
+
+        /* NB handles only looked up by creator (no flips) */
+        hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+        hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+
+        hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.get.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
+        hdr.msg.get.sink_length = HTON__u32 (md->length);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /* Only count the send once the message descriptor exists, so a
+         * GET that fails with PTL_NOSPACE is not accounted as sent. */
+        ni->counters.send_count++;
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it we must allocate a message state object that will record
+         * the information to be filled in once the message has been
+         * completed.  More information is in the do_PtlPut() comments.
+         *
+         * NB. We're now committed to the GET, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle.
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
+                  id->nid, id->pid, NULL, 0, 0);
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c
new file mode 100644 (file)
index 0000000..20a6c66
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-msg.c
+ * Message decoding, parsing and finalizing routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+
+int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+{
+        lib_md_t     *md;
+        lib_eq_t     *eq;
+        int           rc;
+        unsigned long flags;
+
+        /* ni went down while processing this message */
+        if (nal->ni.up == 0) {
+                return -1;
+        }
+
+        if (msg == NULL)
+                return 0;
+
+        rc = 0;
+        if (msg->send_ack) {
+                ptl_hdr_t ack;
+
+                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+
+                memset (&ack, 0, sizeof (ack));
+                ack.type     = HTON__u32 (PTL_MSG_ACK);
+                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.src_nid  = HTON__u64 (nal->ni.nid);
+                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.src_pid  = HTON__u32 (nal->ni.pid);
+                PTL_HDR_LENGTH(&ack) = 0;
+
+                ack.msg.ack.dst_wmd = msg->ack_wmd;
+                ack.msg.ack.match_bits = msg->ev.match_bits;
+                ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
+
+                rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
+                               msg->nid, msg->pid, NULL, 0, 0);
+        }
+
+        md = msg->md;
+        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
+        eq = md->eq;
+
+        state_lock(nal, &flags);
+
+        if (eq != NULL) {
+                ptl_event_t  *ev = &msg->ev;
+                ptl_event_t  *eq_slot;
+
+                /* I have to hold the lock while I bump the sequence number
+                 * and copy the event into the queue.  If not, and I was
+                 * interrupted after bumping the sequence number, other
+                 * events could fill the queue, including the slot I just
+                 * allocated to this event.  On resuming, I would overwrite
+                 * a more 'recent' event with old event state, and
+                 * processes taking events off the queue would not detect
+                 * overflow correctly.
+                 */
+
+                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
+
+                /* size must be a power of 2 to handle a wrapped sequence # */
+                LASSERT (eq->size != 0 &&
+                         eq->size == LOWEST_BIT_SET (eq->size));
+                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+                /* Invalidate unlinked_me unless this is the last
+                 * event for an auto-unlinked MD.  Note that if md was
+                 * auto-unlinked, md->pending can only decrease
+                 */
+                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
+                    md->pending != 1)                       /* not last ref */
+                        ev->unlinked_me = PTL_HANDLE_NONE;
+
+                /* Copy the event into the allocated slot, ensuring all the
+                 * rest of the event's contents have been copied _before_
+                 * the sequence number gets updated.  A processes 'getting'
+                 * an event waits on the next queue slot's sequence to be
+                 * 'new'.  When it is, _all_ other event fields had better
+                 * be consistent.  I assert 'sequence' is the last member,
+                 * so I only need a 2 stage copy.
+                 */
+                LASSERT(sizeof (ptl_event_t) ==
+                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                                    offsetof (ptl_event_t, sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+                /* Updating the sequence number is what makes the event 'new' */
+
+                /* cb_write is not necessarily atomic, so this could
+                   cause a race with PtlEQGet */
+                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                                   (void *)&ev->sequence,sizeof (ev->sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+
+                /* I must also ensure that (a) callbacks are made in the
+                 * same order as the events land in the queue, and (b) the
+                 * callback occurs before the event can be removed from the
+                 * queue, so I can't drop the lock during the callback. */
+                if (nal->cb_callback != NULL)
+                        nal->cb_callback(nal, private, eq, ev);
+                else  if (eq->event_callback != NULL)
+                        (void)((eq->event_callback) (ev));
+        }
+
+        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
+                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+
+        md->pending--;
+        if (md->pending == 0 && /* no more outstanding operations on this md */
+            (md->threshold == 0 ||              /* done its business */
+             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+                lib_md_unlink(nal, md);
+
+        list_del (&msg->msg_list);
+        nal->ni.counters.msgs_alloc--;
+        lib_msg_free(nal, msg);
+
+        state_unlock(nal, &flags);
+
+        return rc;
+}
diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c
new file mode 100644 (file)
index 0000000..37dcb91
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-ni.c
+ * Network status registers and distance functions.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+#define MAX_DIST 18446744073709551615UL
+
+/*
+ * Swap the interface's debug mask: the previous mask is handed back in
+ * ret->rc and the caller's mask_in becomes the new one.  Always
+ * returns 0.
+ */
+int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDebug_in  *in  = v_args;
+        PtlNIDebug_out *out = v_ret;
+
+        /* report the old mask before installing the new one */
+        out->rc = nal->ni.debug;
+        nal->ni.debug = in->mask_in;
+
+        return 0;
+}
+
+/*
+ * Read one of the interface's status registers (counters).
+ *
+ * Incoming:
+ *      ptl_handle_ni_t interface_in
+ *      ptl_sr_index_t register_in
+ *
+ * Outgoing:
+ *      ptl_sr_value_t          * status_out
+ *
+ * Sets ret->rc and returns the same value: PTL_SEGV if no argument block
+ * was supplied, PTL_INV_SR_INDX (with status_out left at 0) for an
+ * unknown register, PTL_OK otherwise.
+ */
+int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIStatus_in  *in  = v_args;
+        PtlNIStatus_out *out = v_ret;
+        lib_counters_t  *ctrs = &nal->ni.counters;
+
+        if (in == NULL)
+                return out->rc = PTL_SEGV;
+
+        out->rc = PTL_OK;
+        out->status_out = 0;
+
+        switch (in->register_in) {
+        case PTL_SR_DROP_COUNT:
+                out->status_out = ctrs->drop_count;
+                break;
+        case PTL_SR_DROP_LENGTH:
+                out->status_out = ctrs->drop_length;
+                break;
+        case PTL_SR_RECV_COUNT:
+                out->status_out = ctrs->recv_count;
+                break;
+        case PTL_SR_RECV_LENGTH:
+                out->status_out = ctrs->recv_length;
+                break;
+        case PTL_SR_SEND_COUNT:
+                out->status_out = ctrs->send_count;
+                break;
+        case PTL_SR_SEND_LENGTH:
+                out->status_out = ctrs->send_length;
+                break;
+        case PTL_SR_MSGS_MAX:
+                out->status_out = ctrs->msgs_max;
+                break;
+        default:
+                out->rc = PTL_INV_SR_INDX;
+                break;
+        }
+
+        return out->rc;
+}
+
+
+/*
+ * Ask the NAL for the distance to a remote process.
+ *
+ * Incoming:
+ *      ptl_handle_ni_t interface_in
+ *      ptl_process_id_t process_in
+ *
+ * Outgoing:
+ *      unsigned long   * distance_out
+ *
+ * Sets ret->rc and returns the same value: PTL_OK with distance_out
+ * filled in by the NAL's cb_dist callback, or PTL_INV_PROC with
+ * distance_out set to MAX_DIST when the callback fails.  The failure
+ * path used to return PTL_INV_PROC without storing it in ret->rc.
+ */
+int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDist_in *args = v_args;
+        PtlNIDist_out *ret = v_ret;
+
+        unsigned long dist;
+        ptl_nid_t nid = args->process_in.nid;
+
+        if (nal->cb_dist(nal, nid, &dist) != 0) {
+                ret->distance_out = (unsigned long) MAX_DIST;
+                return ret->rc = PTL_INV_PROC;
+        }
+
+        ret->distance_out = dist;
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lnet/lnet/lib-not-impl.c b/lnet/lnet/lib-not-impl.c
new file mode 100644 (file)
index 0000000..78959b2
--- /dev/null
@@ -0,0 +1,37 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-not-impl.c
+ *
+ * boiler plate functions that can be used to write the 
+ * library side routines
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+
+int do_PtlACEntry(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Placeholder handler: access-control entries are not implemented.
+         *
+         * Incoming:
+         *      ptl_handle_ni_t ni_in
+         *      ptl_ac_index_t index_in
+         *      ptl_process_id_t match_id_in
+         *      ptl_pt_index_t portal_in
+         *
+         * Outgoing:
+         *      nothing besides rc
+         */
+        PtlACEntry_in *request = v_args;
+        PtlACEntry_out *reply = v_ret;
+
+        /* A missing argument block is reported as PTL_SEGV; every other
+         * call gets PTL_NOT_IMPLEMENTED. */
+        reply->rc = request ? PTL_NOT_IMPLEMENTED : PTL_SEGV;
+
+        return reply->rc;
+}
diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c
new file mode 100644 (file)
index 0000000..e00e9f0
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-pid.c
+ * Process identification routines
+ */
+
+/* This should be removed.  The NAL should have the PID information */
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#if defined (__KERNEL__)
+#       include <linux/kernel.h>
+extern int getpid(void);
+#else
+#       include <stdio.h>
+#       include <unistd.h>
+#endif
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t handle_in
+         *
+         * Outgoing:
+         *      ptl_process_id_t        * id_out
+         *      ptl_id_t                * gsize_out
+         *
+         * Only id_out and rc are written here; the id is copied from the
+         * nid/pid held in the NAL's lib_ni_t.
+         */
+        PtlGetId_out *reply = v_ret;
+
+        reply->id_out.nid = nal->ni.nid;
+        reply->id_out.pid = nal->ni.pid;
+        reply->rc = PTL_OK;
+
+        return reply->rc;
+}
diff --git a/lnet/packaging/.cvsignore b/lnet/packaging/.cvsignore
new file mode 100644 (file)
index 0000000..fd1d56a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+config.status
+config.cache
+configure
+portals.spec
diff --git a/lnet/packaging/Makefile.am b/lnet/packaging/Makefile.am
new file mode 100644 (file)
index 0000000..126bc69
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = portals.spec
\ No newline at end of file
diff --git a/lnet/packaging/portals.spec.in b/lnet/packaging/portals.spec.in
new file mode 100644 (file)
index 0000000..e196b3f
--- /dev/null
@@ -0,0 +1,116 @@
+%define kversion @RELEASE@
+%define linuxdir @LINUX@
+%define version HEAD
+
+Summary: Sandia Portals Message Passing - utilities 
+Name: portals
+Version: %{version}
+Release: 0210101748uml
+Copyright: LGPL
+Group: Utilities/System
+BuildRoot: /var/tmp/portals-%{version}-root
+Source: http://sandiaportals.org/portals-%{version}.tar.gz
+
+%description
+Sandia Portals message passing package.  Contains kernel modules, libraries and utilities. 
+
+%package -n portals-modules
+Summary: Kernel modules and NAL's for portals
+Group: Development/Kernel
+
+%description -n portals-modules
+Portals kernel modules and NALs for Linux %{kversion}.
+
+%package -n portals-source
+Summary: Portals kernel source for rebuilding with other kernels
+Group: Development/Kernel
+
+%description -n portals-source
+Portals kernel source for rebuilding with other kernels
+
+%prep
+%setup -n portals-%{version}
+
+%build
+rm -rf $RPM_BUILD_ROOT
+
+# Create the pristine source directory.
+srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version}
+mkdir -p $srcdir
+find . -name CVS -prune -o -print | cpio -ap $srcdir
+
+# Set an explicit path to our Linux tree, if we can.
+conf_flag=
+linuxdir=%{linuxdir}
+test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+./configure $conf_flag
+make 
+
+%install
+make install prefix=$RPM_BUILD_ROOT
+
+%ifarch alpha
+# this hurts me
+  conf_flag=
+  linuxdir=%{linuxdir}
+  test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+  make clean
+  ./configure --enable-rtscts-myrinet $conf_flag
+  make
+  cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o
+  cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload
+%endif
+
+
+%files
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /usr/sbin/acceptor
+%attr(-, root, root) /usr/sbin/ptlctl
+%attr(-, root, root) /usr/sbin/debugctl
+%ifarch alpha
+%attr(-, root, root) /usr/sbin/mcpload
+%endif
+%attr(-, root, root) /lib/libmyrnal.a
+%attr(-, root, root) /lib/libptlapi.a
+%attr(-, root, root) /lib/libptlctl.a
+%attr(-, root, root) /lib/libprocbridge.a
+%attr(-, root, root) /lib/libptllib.a
+%attr(-, root, root) /lib/libtcpnal.a 
+%attr(-, root, root) /lib/libtcpnalutil.a
+%attr(-, root, root) /usr/include/portals/*.h
+%attr(-, root, root) /usr/include/portals/base/*.h
+%attr(-, root, root) /usr/include/linux/*.h
+
+%files -n portals-modules
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o
+%ifarch alpha
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o
+%endif
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o
+
+%files -n portals-source
+%attr(-, root, root) /usr/src/portals-%{version}
+
+%post
+if [ ! -e /dev/portals ]; then
+   mknod /dev/portals c 10 240
+fi
+depmod -ae || exit 0
+
+grep -q portals /etc/modules.conf || \
+       echo 'alias char-major-10-240 portals' >> /etc/modules.conf
+
+grep -q '/dev/portals' /etc/modules.conf || \
+       echo 'alias /dev/portals portals' >> /etc/modules.conf
+
+%postun
+depmod -ae || exit 0
+
+%clean
+#rm -rf $RPM_BUILD_ROOT
+
+# end of file
diff --git a/lnet/router/Makefile.am b/lnet/router/Makefile.am
new file mode 100644 (file)
index 0000000..1c8087b
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+MODULE = kptlrouter
+modulenet_DATA = kptlrouter.o
+EXTRA_PROGRAMS = kptlrouter
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kptlrouter_SOURCES = router.c proc.c router.h
diff --git a/lnet/router/Makefile.mk b/lnet/router/Makefile.mk
new file mode 100644 (file)
index 0000000..64bd09b
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += kptlrouter.o
+kptlrouter-objs    := router.o proc.o
diff --git a/lnet/router/proc.c b/lnet/router/proc.c
new file mode 100644 (file)
index 0000000..dd65b34
--- /dev/null
@@ -0,0 +1,78 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+#define KPR_PROC_ROUTER "sys/portals/router"
+
+int
+kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+        /* /proc read handler: writes the forwarding statistics into 'page'
+         * as a single line, "<bytes> <packets> <errors> <queue depth>\n",
+         * and returns its length.
+         *
+         * No lock is taken while sampling the counters, so the four values
+         * are only a loose snapshot.  The whole report is produced by the
+         * read at offset 0; a read at any other offset returns 0 bytes.
+         * *eof is always set. */
+        unsigned long long bytes = kpr_fwd_bytes;
+        unsigned long      packets = kpr_fwd_packets;
+        unsigned long      errors = kpr_fwd_errors;
+        unsigned int       qdepth = atomic_read (&kpr_queue_depth);
+        int                len;
+
+        *eof = 1;
+        if (off != 0)
+                return (0);
+
+        /* All four values are unsigned, so use unsigned conversions: the
+         * signed ones would show large counts as negative numbers. */
+        len = sprintf (page, "%Lu %lu %lu %u\n", bytes, packets, errors, qdepth);
+
+        *start = page;
+        return (len);
+}
+
+int
+kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data)
+{
+        /* /proc write handler.  The data written is never examined: any
+         * write resets the three forwarding counters, and the full count
+         * is reported back as consumed. */
+        kpr_fwd_errors = 0;
+        kpr_fwd_packets = 0;
+        kpr_fwd_bytes = 0;
+
+        return (count);
+}
+
+void
+kpr_proc_init(void)
+{
+        /* Create the router's /proc file (world readable, owner writable)
+         * and attach the read/write handlers.  Failure to create the entry
+         * is only logged. */
+        struct proc_dir_entry *pde;
+
+        pde = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL);
+        if (pde != NULL) {
+                pde->data = NULL;
+                pde->read_proc = kpr_proc_read;
+                pde->write_proc = kpr_proc_write;
+                return;
+        }
+
+        CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER);
+}
+
+void
+kpr_proc_fini(void)
+{
+        /* Remove the /proc file that kpr_proc_init() creates. */
+        remove_proc_entry(KPR_PROC_ROUTER, NULL);
+}
diff --git a/lnet/router/router.c b/lnet/router/router.c
new file mode 100644 (file)
index 0000000..8a1de08
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+struct list_head kpr_routes;
+struct list_head kpr_nals;
+
+unsigned long long kpr_fwd_bytes;
+unsigned long      kpr_fwd_packets;
+unsigned long      kpr_fwd_errors;
+atomic_t           kpr_queue_depth;
+
+/* Mostly the tables are read-only (thread and interrupt context)
+ *
+ * Once in a blue moon we register/deregister NALs and add/remove routing
+ * entries (thread context only)... */
+rwlock_t         kpr_rwlock;
+
+/* Table of the router's NAL-facing entry points (registration, route
+ * lookup, packet forwarding/completion, shutdown and deregistration).
+ * It is registered with PORTAL_SYMBOL_REGISTER() in kpr_initialise() and
+ * exported with EXPORT_SYMBOL() at the end of this file. */
+kpr_router_interface_t kpr_router_interface = {
+       kprri_register:         kpr_register_nal,
+       kprri_lookup:           kpr_lookup_target,
+       kprri_fwd_start:        kpr_forward_packet,
+       kprri_fwd_done:         kpr_complete_packet,
+       kprri_shutdown:         kpr_shutdown_nal,
+       kprri_deregister:       kpr_deregister_nal,
+};
+
+/* Table of the routing-table management entry points (add, delete and
+ * enumerate routes).  Registered and exported the same way as
+ * kpr_router_interface. */
+kpr_control_interface_t kpr_control_interface = {
+       kprci_add_route:        kpr_add_route,
+       kprci_del_route:        kpr_del_route,
+       kprci_get_route:        kpr_get_route,
+};
+
+int
+kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
+{
+        /* Register a NAL with the router.
+         *
+         * A private copy of *nalif is stored in a newly allocated
+         * kpr_nal_entry_t, which is added to kpr_nals; the entry's address
+         * is handed back through *argp (the other kpr_*() NAL entry points
+         * in this file cast their 'arg' back to this entry).  A module
+         * reference is taken on success.
+         *
+         * Returns 0 on success, -ENOMEM if the entry cannot be allocated,
+         * or -EEXIST if a NAL with the same kprni_nalid is already
+         * registered.  Must not be called from interrupt context. */
+        unsigned long      flags;       /* irqsave state is an unsigned long */
+        struct list_head  *e;
+        kpr_nal_entry_t   *ne;
+
+        CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+
+        PORTAL_ALLOC (ne, sizeof (*ne));
+        if (ne == NULL)
+                return (-ENOMEM);
+
+        /* Zeroing also leaves kpne_refcount and kpne_shutdown at 0 */
+        memset (ne, 0, sizeof (*ne));
+        memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        /* Refuse a second registration of the same NAL id */
+        for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+        {
+                kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+                if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
+                {
+                        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                        CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid);
+
+                        PORTAL_FREE (ne, sizeof (*ne));
+                        return (-EEXIST);
+                }
+        }
+
+        list_add (&ne->kpne_list, &kpr_nals);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+        *argp = ne;
+        PORTAL_MODULE_USE;
+        return (0);
+}
+
+void
+kpr_shutdown_nal (void *arg)
+{
+        /* Mark a registered NAL as shutting down and wait until no packet
+         * forward holds a reference on it any more.
+         *
+         * 'arg' is the entry returned by kpr_register_nal().  Once
+         * kpne_shutdown is set, kpr_lookup_target() and
+         * kpr_forward_packet() refuse to use the entry.  Sleeps, so must
+         * not be called from interrupt context; must only be called once
+         * per entry. */
+        unsigned long    flags;         /* irqsave state is an unsigned long */
+        kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+        LASSERT (!ne->kpne_shutdown);
+        LASSERT (!in_interrupt());
+
+        write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
+        ne->kpne_shutdown = 1;
+        write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
+
+        /* Poll once a second until all in-flight forwards have completed */
+        while (atomic_read (&ne->kpne_refcount) != 0)
+        {
+                CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
+                        ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
+
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+}
+
+void
+kpr_deregister_nal (void *arg)
+{
+        /* Remove a NAL entry from kpr_nals, free it and drop the module
+         * reference taken at registration.
+         *
+         * 'arg' is the entry returned by kpr_register_nal().  The caller
+         * must already have called kpr_shutdown_nal() on it, so that it is
+         * flagged as shut down and has no outstanding references.  Must
+         * not be called from interrupt context. */
+        unsigned long     flags;        /* irqsave state is an unsigned long */
+        kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+        LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
+        LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
+        LASSERT (!in_interrupt());
+
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        list_del (&ne->kpne_list);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+        PORTAL_FREE (ne, sizeof (*ne));
+        PORTAL_MODULE_UNUSE;
+}
+
+
+int
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+{
+        /* Find the gateway through which the calling NAL reaches target_nid.
+         *
+         * The first route whose [lo, hi] nid range contains target_nid
+         * decides the outcome:
+         *   0             its gateway is on the caller's NAL; the gateway
+         *                 nid is stored in *gateway_nidp
+         *   -EHOSTUNREACH its gateway is on a different NAL
+         * -ENOENT is returned when no route covers target_nid, or when the
+         * calling NAL is already shutting down. */
+        kpr_nal_entry_t  *caller = (kpr_nal_entry_t *)arg;
+        struct list_head *pos;
+        int               result = -ENOENT;
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, caller->kpne_interface.kprni_nalid);
+
+        if (caller->kpne_shutdown)      /* caller is shutting down */
+                return (-ENOENT);
+
+        read_lock (&kpr_rwlock);
+
+        for (pos = kpr_routes.next; pos != &kpr_routes; pos = pos->next) {
+                kpr_route_entry_t *route = list_entry (pos, kpr_route_entry_t, kpre_list);
+
+                if (target_nid >= route->kpre_lo_nid &&
+                    target_nid <= route->kpre_hi_nid) {
+                        /* found the covering table entry */
+                        if (route->kpre_gateway_nalid == caller->kpne_interface.kprni_nalid) {
+                                *gateway_nidp = route->kpre_gateway_nid;
+                                result = 0;
+                        } else {
+                                /* gateway is on a different NAL */
+                                result = -EHOSTUNREACH;
+                        }
+                        break;
+                }
+        }
+
+        read_unlock (&kpr_rwlock);
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+                target_nid, caller->kpne_interface.kprni_nalid, result,
+                (result == 0) ? *gateway_nidp : (ptl_nid_t)0);
+        return (result);
+}
+
+void
+kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        /* Forward the packet described by 'fwd', received on the NAL whose
+         * entry is 'arg', towards fwd->kprfd_target_nid.
+         *
+         * The first route covering the target nid is used, provided its
+         * gateway is on a different, registered NAL that is not shutting
+         * down.  In that case references are held on both the source and
+         * the destination NAL entries and the packet is passed to the
+         * destination NAL's kprni_fwd(); kpr_complete_packet() drops the
+         * references again.
+         *
+         * If the packet cannot be forwarded, fwd's callback is invoked
+         * here with -EHOSTUNREACH and the error counter is bumped. */
+        kpr_nal_entry_t  *src_ne = (kpr_nal_entry_t *)arg;
+        ptl_nid_t         target_nid = fwd->kprfd_target_nid;
+        int               nob = fwd->kprfd_nob;
+        struct list_head *e;
+
+        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
+        LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov));
+
+        atomic_inc (&kpr_queue_depth);
+        /* Source nal is busy until fwd completes.  The reference must be
+         * taken before anything can jump to 'out', because the failure
+         * path always drops it; taking it after the shutdown check let the
+         * refcount go negative when the caller was shutting down. */
+        atomic_inc (&src_ne->kpne_refcount);
+
+        kpr_fwd_packets++;                   /* (loose) stats accounting */
+        kpr_fwd_bytes += nob;
+
+        if (src_ne->kpne_shutdown)              /* caller is shutting down */
+                goto out;
+
+        fwd->kprfd_router_arg = src_ne;         /* stash caller's nal entry */
+
+        read_lock (&kpr_rwlock);
+
+        /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+        {
+                kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+                if (re->kpre_lo_nid > target_nid || /* no match */
+                    re->kpre_hi_nid < target_nid)
+                        continue;
+
+                CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
+                        target_nid, src_ne->kpne_interface.kprni_nalid,
+                        re->kpre_gateway_nid, re->kpre_gateway_nalid);
+
+                if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
+                        break;                  /* don't route to same NAL */
+
+                /* Search for gateway's NAL's entry.  'e' is reused for this
+                 * inner walk; that is safe because the outer loop is always
+                 * left (return or break) once the inner one finishes. */
+
+                for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+                {
+                        kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+                        if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
+                                continue;
+
+                        if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
+                                break;
+
+                        fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
+                        atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+
+                        read_unlock (&kpr_rwlock);
+
+                        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
+                                target_nid, src_ne->kpne_interface.kprni_nalid,
+                                fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+
+                        dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+                        return;
+                }
+                break;
+        }
+
+        read_unlock (&kpr_rwlock);
+ out:
+        kpr_fwd_errors++;
+
+        CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        /* Can't find anywhere to forward to */
+        (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
+
+        atomic_dec (&kpr_queue_depth);
+        atomic_dec (&src_ne->kpne_refcount);
+}
+
+void
+kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
+{
+       /* Completion of a forward that kpr_forward_packet() handed to a
+        * destination NAL.  'arg' is the destination NAL's entry; the source
+        * NAL's entry is the one kpr_forward_packet() stashed in
+        * fwd->kprfd_router_arg.  Drops the reference on the destination
+        * entry, runs fwd's callback with 'error', then drops the queue
+        * depth and the reference on the source entry.
+        *
+        * The statement order matters: each entry may only be touched
+        * before its own refcount is dropped. */
+       kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
+       kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
+
+        CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
+
+       atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
+
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
+
+       /* Only the value of the 'fwd' pointer is logged here; it is not
+        * dereferenced after the callback has run. */
+        CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, error);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
+}
+
+int
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+               ptl_nid_t hi_nid)
+{
+        /* Add a route saying that nids in [lo_nid, hi_nid] are reached via
+         * gateway_nid on NAL gateway_nalid.
+         *
+         * Returns 0 on success, -ENOMEM if the entry cannot be allocated,
+         * or -EINVAL if the range overlaps a route already in the table.
+         * lo_nid must not exceed hi_nid (asserted).  Must not be called
+         * from interrupt context. */
+        unsigned long      flags;       /* irqsave state is an unsigned long */
+        struct list_head  *e;
+        kpr_route_entry_t *re;
+
+        CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+               gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        LASSERT(lo_nid <= hi_nid);
+
+        PORTAL_ALLOC (re, sizeof (*re));
+        if (re == NULL)
+                return (-ENOMEM);
+
+        re->kpre_gateway_nalid = gateway_nalid;
+        re->kpre_gateway_nid = gateway_nid;
+        re->kpre_lo_nid = lo_nid;
+        re->kpre_hi_nid = hi_nid;
+
+        LASSERT(!in_interrupt());
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        /* Reject the new range if it intersects any existing one */
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
+                                                    kpre_list);
+
+                if (re->kpre_lo_nid > re2->kpre_hi_nid ||
+                    re->kpre_hi_nid < re2->kpre_lo_nid)
+                        continue;
+
+                /* (a space was missing between the two string pieces) */
+                CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"] "
+                        "to ["LPX64" - "LPX64"]\n",
+                        re->kpre_lo_nid, re->kpre_hi_nid,
+                        re2->kpre_lo_nid, re2->kpre_hi_nid);
+
+                write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                PORTAL_FREE (re, sizeof (*re));
+                return (-EINVAL);
+        }
+
+        list_add (&re->kpre_list, &kpr_routes);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+        return (0);
+}
+
+int
+kpr_del_route (ptl_nid_t nid)
+{
+        /* Delete the first route whose [lo, hi] nid range contains 'nid'.
+         *
+         * Returns 0 if a route was removed and freed, -ENOENT if no route
+         * covers 'nid'.  Must not be called from interrupt context. */
+        unsigned long      flags;       /* irqsave state is an unsigned long */
+        struct list_head  *e;
+
+        CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+
+        LASSERT(!in_interrupt());
+        write_lock_irqsave(&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+                        continue;
+
+                /* Unlink under the lock; free once it has been dropped */
+                list_del (&re->kpre_list);
+                write_unlock_irqrestore(&kpr_rwlock, flags);
+
+                PORTAL_FREE(re, sizeof (*re));
+                return (0);
+        }
+
+        write_unlock_irqrestore(&kpr_rwlock, flags);
+        return (-ENOENT);
+}
+
+int
+kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+              ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+{
+        /* Copy out the idx'th entry (counting from 0, in list order) of
+         * the routing table.  Returns 0 and fills in all four output
+         * parameters if the entry exists, otherwise returns -ENOENT and
+         * leaves them untouched. */
+        struct list_head  *pos;
+        int                rc = -ENOENT;
+
+        read_lock(&kpr_rwlock);
+
+        /* idx counts down by one for every entry walked past */
+        for (pos = kpr_routes.next; pos != &kpr_routes; pos = pos->next, idx--) {
+                kpr_route_entry_t *route;
+
+                if (idx != 0)
+                        continue;
+
+                route = list_entry(pos, kpr_route_entry_t, kpre_list);
+
+                *gateway_nalid = route->kpre_gateway_nalid;
+                *gateway_nid = route->kpre_gateway_nid;
+                *lo_nid = route->kpre_lo_nid;
+                *hi_nid = route->kpre_hi_nid;
+
+                rc = 0;
+                break;
+        }
+
+        read_unlock(&kpr_rwlock);
+        return (rc);
+}
+
+static void __exit
+kpr_finalise (void)
+{
+        /* Module exit: all NALs must have deregistered by now.  Frees any
+         * routes still in the table, removes the /proc file and
+         * unregisters both interface tables. */
+        kpr_route_entry_t *re;
+
+        LASSERT (list_empty (&kpr_nals));
+
+        for (;;) {
+                if (list_empty (&kpr_routes))
+                        break;
+
+                /* pop the head of the route list and free it */
+                re = list_entry(kpr_routes.next, kpr_route_entry_t, kpre_list);
+                list_del(&re->kpre_list);
+                PORTAL_FREE(re, sizeof (*re));
+        }
+
+        kpr_proc_fini();
+
+        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
+
+        CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
+               atomic_read(&portal_kmemory));
+}
+
+static int __init
+kpr_initialise (void)
+{
+        /* Module init: set up the (empty) NAL and route tables and their
+         * lock, create the /proc file, then register both interface
+         * tables.  Always returns 0. */
+        CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+        /* tables and lock first: nothing else can reach them yet */
+        INIT_LIST_HEAD(&kpr_nals);
+        INIT_LIST_HEAD(&kpr_routes);
+        rwlock_init(&kpr_rwlock);
+
+        kpr_proc_init();
+
+        PORTAL_SYMBOL_REGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_REGISTER(kpr_control_interface);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Eric Barton");
+MODULE_DESCRIPTION("Kernel Portals Router v0.01");
+MODULE_LICENSE("GPL");
+
+module_init (kpr_initialise);
+module_exit (kpr_finalise);
+
+EXPORT_SYMBOL (kpr_control_interface);
+EXPORT_SYMBOL (kpr_router_interface);
diff --git a/lnet/router/router.h b/lnet/router/router.h
new file mode 100644 (file)
index 0000000..b8c3bec
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _KPTLROUTER_H
+#define _KPTLROUTER_H
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#define DEBUG_SUBSYSTEM S_PTLROUTER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+/* One registered NAL, as created by kpr_register_nal() in router.c. */
+typedef struct
+{
+       struct list_head        kpne_list;      /* linkage on the kpr_nals list */
+       kpr_nal_interface_t     kpne_interface; /* copy of the interface passed at registration */
+       atomic_t                kpne_refcount;  /* held by packet forwards in progress */
+       int                     kpne_shutdown;  /* set by kpr_shutdown_nal() */
+} kpr_nal_entry_t;
+
+/* One routing table entry, as created by kpr_add_route() in router.c:
+ * nids in [kpre_lo_nid, kpre_hi_nid] are reached through kpre_gateway_nid
+ * on the NAL identified by kpre_gateway_nalid. */
+typedef struct
+{
+       struct list_head        kpre_list;      /* linkage on the kpr_routes list */
+       int                     kpre_gateway_nalid; /* id of the NAL the gateway is on */
+       ptl_nid_t               kpre_gateway_nid;   /* nid of the gateway */
+       ptl_nid_t               kpre_lo_nid;    /* lowest target nid covered (inclusive) */
+        ptl_nid_t               kpre_hi_nid;   /* highest target nid covered (inclusive) */
+} kpr_route_entry_t;
+
+extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_shutdown_nal (void *arg);
+extern void kpr_deregister_nal (void *arg);
+
+extern void kpr_proc_init (void);
+extern void kpr_proc_fini (void);
+
+extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, 
+                          ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, 
+                          ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+
+extern unsigned long long kpr_fwd_bytes;
+extern unsigned long      kpr_fwd_packets;
+extern unsigned long      kpr_fwd_errors;
+extern atomic_t           kpr_queue_depth;
+
+#endif /* _KPTLROUTER_H */
diff --git a/lnet/tests/.cvsignore b/lnet/tests/.cvsignore
new file mode 100644 (file)
index 0000000..051d1bd
--- /dev/null
@@ -0,0 +1,3 @@
+Makefile
+Makefile.in
+.deps
diff --git a/lnet/tests/Makefile.am b/lnet/tests/Makefile.am
new file mode 100644 (file)
index 0000000..7b47ae0
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r
+LINK = $(LD) $(LDFLAGS) -o $@
+DEFS =
+LIBS =
+MODULE = $(basename)
+EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh
+
+noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o 
+
+pingsrv_o_SOURCES = ping_srv.c ping.h
+
+pingcli_o_SOURCES = ping_cli.c ping.h
+
+spingsrv_o_SOURCES = sping_srv.c ping.h
+
+spingcli_o_SOURCES = sping_cli.c ping.h
diff --git a/lnet/tests/ping.h b/lnet/tests/ping.h
new file mode 100644 (file)
index 0000000..f07444b
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _KPING_INCLUDED
+#define _KPING_INCLUDED
+
+#include <portals/p30.h>
+
+
+#define PTL_PING_IN_SIZE               256     // n packets per buffer
+#define PTL_PING_IN_BUFFERS            2       // n fallback buffers
+
+#define PTL_PING_CLIENT                        4
+#define PTL_PING_SERVER                        5
+
+#define PING_HEADER_MAGIC              0xDEADBEEF
+#define PING_BULK_MAGIC                        0xCAFEBABE
+
+#define PING_HEAD_BITS                 0x00000001
+#define PING_BULK_BITS                 0x00000002
+#define PING_IGNORE_BITS               0xFFFFFFFC
+
+#define PTL_PING_ACK                   0x01
+#define PTL_PING_VERBOSE               0x02
+#define PTL_PING_VERIFY                        0x04
+#define PTL_PING_PREALLOC              0x08
+
+
+/* Index following 'index', wrapping to 0 once PTL_PING_IN_BUFFERS is
+ * reached.  'index' is parenthesized so that an expression argument
+ * expands correctly; note that it is evaluated twice. */
+#define NEXT_PRIMARY_BUFFER(index)             \
+       ((((index) + 1) >= PTL_PING_IN_BUFFERS) ? 0 : ((index) + 1))
+
+/* Log 'str' together with the text for Portals error code 'err' (looked
+ * up in ptl_err_str[]) and its numeric value, via CERROR. */
+#define PDEBUG(str, err)                       \
+       CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err)
+
+
+/* Ping data to be passed via the ioctl to kernel space */
+
+#if __KERNEL__
+
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+/* State kept by the kernel ping server (Portals handles, descriptors,
+ * buffer, ids and the task pointer). */
+struct pingsrv_data {
+        ptl_handle_ni_t   ni;
+        ptl_handle_me_t   me;
+        ptl_handle_eq_t   eq;
+        void             *in_buf;
+        ptl_process_id_t  my_id;
+        ptl_process_id_t  id_local;
+        ptl_md_t          mdin;
+        ptl_md_t          mdout;
+        ptl_handle_md_t   mdin_h;
+        ptl_handle_md_t   mdout_h;
+        ptl_event_t       evnt;
+        struct task_struct *tsk;
+};
+/* State kept by the kernel ping client (ioctl arguments, Portals handles,
+ * descriptors, buffers, ids and the task pointer). */
+struct pingcli_data {
+        struct portal_ioctl_data *args;
+        ptl_handle_me_t   me;
+        ptl_handle_eq_t   eq;
+        char             *inbuf;
+        char             *outbuf;
+        ptl_process_id_t  myid;
+        ptl_process_id_t  id_local;
+        ptl_process_id_t  id_remote;
+        ptl_md_t          md_in_head;
+        ptl_md_t          md_out_head;
+        ptl_handle_md_t   md_in_head_h;
+        ptl_handle_md_t   md_out_head_h;
+        ptl_event_t       ev;
+        struct task_struct *tsk;
+};
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _KPING_INCLUDED */
diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c
new file mode 100644 (file)
index 0000000..389ffbb
--- /dev/null
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+/* Size of the fixed part of a ping message: the magic number, the
+ * sequence number and the struct timeval send timestamp, in that order. */
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+
+/* Not referenced anywhere else in this file. */
+#define MAX_TIME 100000
+
+/* This should be enclosed in a structure */
+
+/* The single in-flight ping request; allocated by kping_client() and
+ * freed by pingcli_shutdown(). */
+static struct pingcli_data *client = NULL;
+
+/* Sequence number of the ping currently being waited for.  It is written
+ * into each outgoing message by pingcli_start() and compared against the
+ * sequence number of each reply in pingcli_callback(). */
+static int count = 0;
+
+/*
+ * Release the client's Portals resources and the client structure.
+ *
+ * 'err' says how far setup had progressed.  Each case deliberately falls
+ * through to the next, so everything acquired before that point is
+ * released as well:
+ *   1 - unlink the outgoing memory descriptor
+ *   2 - unlink the incoming memory descriptor, free the event queue and
+ *       unlink the match entry
+ *   3 - drop the NAL reference taken with kportal_get_ni()
+ *   4 - free the client structure
+ * Any other value releases nothing.  The ping data buffers (inbuf and
+ * outbuf) are not freed here.  Does nothing if there is no client.
+ */
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Every case below dereferences 'client', so without one there
+         * is nothing that can safely be released. */
+        if (client == NULL)
+                return;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                        /* fall through */
+                case 2:
+                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+                        /* fall through */
+                case 4:
+                        PORTAL_FREE (client, sizeof(struct pingcli_data));
+
+                        /* Do not leave the file-scope pointer dangling */
+                        client = NULL;
+                        break;
+
+                default:
+                        break;
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+/*
+ * Event queue callback for the client's incoming buffer.
+ *
+ * The message that triggered the event starts 'offset' bytes into the
+ * event's memory descriptor: first the magic number, then the sequence
+ * number.  A reply carrying the expected magic wakes the sleeping client
+ * task if its sequence number matches 'count' (or if 'count' is still 0);
+ * any other reply is only logged.  Always returns 1.
+ */
+static int pingcli_callback(ptl_event_t *ev)
+{
+        char *msg  = (char *)ev->mem_desc.start + ev->offset;
+        int   head = *(int *)msg;
+        int   seq  = *(int *)(msg + sizeof(unsigned));
+
+        if (head != 0xcafebabe) {
+                printk ("Unexpected response \n");
+        } else if (count == 0 || seq == count) {
+                wake_up_process (client->tsk);
+        } else {
+                printk ("Received response after timeout for %d\n", seq);
+        }
+
+        return 1;
+}
+
+
+/*
+ * Shut the client down to cleanup level 'level' (see pingcli_shutdown())
+ * and then free the ping buffers.  The buffer pointers are copied first
+ * because pingcli_shutdown() frees the client structure that holds them.
+ * The buffers are freed last so that, at levels 1 and 2, the incoming
+ * memory descriptor has already been unlinked when its buffer goes away.
+ * Either buffer may be NULL if it was never allocated.
+ */
+static void
+pingcli_release(int level, size_t out_len, size_t in_len)
+{
+        char *outbuf = client->outbuf;
+        char *inbuf  = client->inbuf;
+
+        pingcli_shutdown (level);
+
+        if (outbuf != NULL)
+                PORTAL_FREE (outbuf, out_len);
+
+        if (inbuf != NULL)
+                PORTAL_FREE (inbuf, in_len);
+}
+
+/*
+ * Run one ping request described by 'args': set up the Portals objects,
+ * send args->ioc_count pings of STDSIZE + args->ioc_size bytes to
+ * args->ioc_nid, wait for each reply (or a timeout) and log the result.
+ *
+ * The global 'client' must already be allocated and zeroed.  All
+ * resources, including 'client' itself, are released before returning on
+ * every path, so the function always returns NULL.
+ */
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        /* One outgoing message, and room for one reply per ping sent */
+        size_t out_len = STDSIZE + args->ioc_size;
+        size_t in_len  = (args->ioc_size + STDSIZE) * args->ioc_count;
+        int rc;
+        struct timeval tv1, tv2;
+
+        client->tsk = current;
+        client->args = args;
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64",  \
+                        nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+        /* ioc_size and ioc_count come from the ioctl caller; refuse
+         * values for which the buffer sizes above wrapped around. */
+        if (out_len < STDSIZE ||
+            (args->ioc_count != 0 && in_len / args->ioc_count != out_len)) {
+                CERROR ("ping size %u x count %u is too large\n",
+                        args->ioc_size, args->ioc_count);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->outbuf, out_len);
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", out_len);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf, in_len);
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", in_len);
+                pingcli_release (4, out_len, in_len);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded\n", args->ioc_nal);
+                pingcli_release (4, out_len, in_len);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID.
+         * Only the NAL reference is held so far, hence level 3. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_release (3, out_len, in_len);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_release (3, out_len, in_len);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface.  There is
+         * no cleanup level for "match entry only", so level 2 is used to
+         * get the match entry unlinked. */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_release (2, out_len, in_len);
+                return (NULL);
+        }
+
+        /* Reset the sequence number before the callback can fire */
+        count = 0;
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = in_len;
+        client->md_in_head.threshold = PTL_MD_THRESH_INF;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, in_len);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_release (2, out_len, in_len);
+                return (NULL);
+        }
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = out_len;
+        client->md_out_head.threshold = args->ioc_count;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header.  The outgoing descriptor does
+         * not exist if this fails, hence level 2 rather than 1. */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_release (2, out_len, in_len);
+                return NULL;
+        }
+        while ((args->ioc_count - count)) {
+                /* Message layout: magic, sequence number, send time */
+                memcpy (client->outbuf + sizeof(unsigned),
+                       &(count), sizeof(unsigned));
+                 /* Put the ping packet */
+                do_gettimeofday (&tv1);
+
+                memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1,
+                       sizeof(struct timeval));
+
+                if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                         PDEBUG ("PtlPut (header)", rc);
+                         pingcli_release (1, out_len, in_len);
+                         return NULL;
+                }
+                printk ("sent msg no %d", count);
+
+                /* NOTE(review): a reply that arrives before the task
+                 * state is set here is not noticed until the timeout
+                 * expires. */
+                set_current_state (TASK_INTERRUPTIBLE);
+                rc = schedule_timeout (20 * args->ioc_timeout);
+                if (rc == 0) {
+                        printk ("   ::  timeout .....\n");
+                } else {
+                        do_gettimeofday (&tv2);
+                        printk("   ::  Reply in %u usec\n",
+                                (unsigned)((tv2.tv_sec - tv1.tv_sec)
+                                 * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
+                }
+                count++;
+        }
+
+        /* Same cleanup level as before on the normal path; the buffers
+         * are now released only after the descriptors are torn down.
+         * NOTE(review): level 2 does not unlink the outgoing descriptor;
+         * confirm it is released once its threshold is used up. */
+        pingcli_release (2, out_len, in_len);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/*
+ * Entry point called by the portals ioctl code for ping requests.
+ * Allocates and zeroes the global client structure, then runs the ping
+ * described by 'args'.  Returns 0 in every case, including when the
+ * client structure cannot be allocated.
+ */
+static int kping_client(struct portal_ioctl_data *args)
+{
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client != NULL) {
+                memset (client, 0, sizeof(struct pingcli_data));
+                pingcli_start (args);
+        } else {
+                CERROR ("Unable to allocate client structure\n");
+        }
+
+        return 0;
+} /* kping_client() */
+
+
+/* Module load hook: register kping_client through
+ * PORTAL_SYMBOL_REGISTER.  Always returns 0. */
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+/* Module unload hook: undo the registration made in pingcli_init(). */
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+/* Module metadata and the load/unload entry points. */
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+/* kping_client is exported explicitly only when building for kernels
+ * 2.5.0 and later. */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c
new file mode 100644 (file)
index 0000000..1037d09
--- /dev/null
@@ -0,0 +1,308 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+/* Size of the fixed part of a ping message: two ints followed by a
+ * struct timeval. */
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+/* Size of the server's receive buffer, in bytes. */
+#define MAXSIZE (16*1024*1024)
+
+/* Set from PING_HEADER_MAGIC / PING_BULK_MAGIC in pingsrv_init(); not
+ * read anywhere else in this file. */
+static unsigned ping_head_magic;
+static unsigned ping_bulk_magic;
+static int nal  = 0;                            // Your NAL,
+static unsigned long packets_valid = 0;         // Valid packets 
+/* Shutdown handshake flag: pingsrv_cleanup() clears it to ask the server
+ * thread to stop, and the thread sets it back to 1 once it has finished. */
+static int running = 1;
+/* Number of received pings not yet answered by the server thread.
+ * NOTE(review): this has external linkage; confirm whether that is
+ * intended. */
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+/*
+ * Release the server's Portals resources and the server structure.
+ *
+ * 'err' says how far setup had progressed.  Each case deliberately falls
+ * through to the next, so everything acquired before that point is
+ * released as well:
+ *   1    - unlink the incoming memory descriptor
+ *   2    - free the event queue and unlink the match entry
+ *   3    - drop the NAL reference taken with kportal_get_ni()
+ *   4, 5 - free the receive buffer (if any) and the server structure
+ * Any other value releases nothing.  Does nothing if there is no server.
+ *
+ * Always returns NULL, so callers can write
+ * "return pingsrv_shutdown (n);" on their failure paths.
+ */
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Every case below dereferences 'server', so without one there
+         * is nothing that can safely be released. */
+        if (server == NULL)
+                return NULL;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                        PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (nal);
+                        /* fall through */
+                case 4:
+                case 5:
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, MAXSIZE);
+
+                        PORTAL_FREE (server, sizeof (struct pingsrv_data));
+
+                        /* Do not leave the file-scope pointer dangling */
+                        server = NULL;
+                        break;
+
+                default:
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping sever resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+/*
+ * Body of the ping server thread.
+ *
+ * Sleeps until pingsrv_packet() reports a received ping, then overwrites
+ * the magic number at the start of the receive buffer with the reply
+ * magic, re-arms the receive descriptor and sends the buffer back to the
+ * initiator on the PTL_PING_CLIENT portal.  The loop ends when 'running'
+ * is cleared; the server is then shut down and 'running' set back to 1
+ * to tell pingsrv_cleanup() that the thread is done.
+ *
+ * Returns 0 on a normal stop, or 1 if the reply descriptor could not be
+ * bound (the server is shut down in that case too).
+ */
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        /* Both magic values must be exactly as wide as the int-sized
+         * field at the start of the message: a wider type would make the
+         * comparison below fail after sign extension, and the memcpy
+         * would spill into the sequence number that follows the magic. */
+        unsigned magic;
+        unsigned reply_magic = 0xcafebabe;
+
+        kportal_daemonize ("pingsrv");
+        server->tsk =  current;
+
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                /* There is work to do, so do not stay marked as sleeping
+                 * while calling into Portals. */
+                set_current_state (TASK_RUNNING);
+
+                magic = *((unsigned *)(server->evnt.mem_desc.start
+                                        + server->evnt.offset));
+
+
+                if(magic != 0xdeadbeef) {
+                        printk("Unexpected Packet to the server\n");
+
+                }
+                memcpy (server->in_buf, &reply_magic, sizeof(reply_magic));
+
+                server->mdout.length    = server->evnt.rlength;
+                server->mdout.start     = server->in_buf;
+                server->mdout.threshold = 1;
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+
+                /* Bind the outgoing buffer.
+                 * NOTE(review): this early exit does not take part in the
+                 * 'running' handshake that pingsrv_cleanup() waits on. */
+                if ((rc = PtlMDBind (server->ni, server->mdout,
+                                                &server->mdout_h))) {
+                         PDEBUG ("PtlMDBind", rc);
+                         pingsrv_shutdown (1);
+                         return 1;
+               }
+
+
+                /* Re-arm the receive descriptor for the next ping */
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = MAXSIZE;
+                server->mdin.threshold = 1;
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                        CDEBUG (D_OTHER, "ping server resources allocated\n");
+                }
+
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+
+                atomic_dec (&pkt);
+
+        }
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;
+}
+
+/* Record one more ping waiting to be answered and wake the server
+ * thread to handle it.  'ev' is not used.  Always returns 1. */
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
+/*
+ * Event queue callback for the server's receive buffer.
+ *
+ * Saves a copy of the event for the server thread, logs the initiator
+ * and the first three int-sized words of the message (printed as head,
+ * seq and size), counts the packet and hands over to pingsrv_packet().
+ * Returns 0 if 'ev' is NULL, otherwise whatever pingsrv_packet() returns.
+ */
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        int *words;
+
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+
+        server->evnt = *ev;
+
+        /* The message starts 'offset' bytes into the descriptor */
+        words = (int *)((char *)ev->mem_desc.start + ev->offset);
+
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               words[0], words[1], words[2]);
+
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+
+} /* pingsrv_callback() */
+
+
+/*
+ * Set up the Portals side of the ping server: take a reference on the
+ * NAL selected by 'nal', attach a match entry on the PTL_PING_SERVER
+ * portal, allocate the event queue and the receive buffer, and attach
+ * the receive descriptor.
+ *
+ * The global 'server' must already be allocated.  Returns 'server' on
+ * success.  On any failure everything acquired so far, including
+ * 'server' itself, is released through pingsrv_shutdown() and NULL is
+ * returned.
+ */
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+       /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID.
+         * Only the NAL reference is held so far, hence level 3. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entries for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (3);
+        }
+
+
+        /* There is no cleanup level for "match entry only", so level 2
+         * is used to get the match entry unlinked. */
+        if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        PORTAL_ALLOC (server->in_buf, MAXSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = MAXSIZE;
+        server->mdin.threshold = 1;
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+
+        /* Without the receive descriptor no ping can ever arrive, so a
+         * failure here is a setup failure like any other. */
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+
+        /* Success! */
+        return server;
+} /* pingsrv_setup() */
+
+/*
+ * Set up the server and start the thread that answers pings.
+ *
+ * Returns 0 on success, -ENOMEM if pingsrv_setup() fails, or the
+ * negative value returned by kernel_thread() if the thread cannot be
+ * created.  In the last case the resources acquired by pingsrv_setup()
+ * are released again.
+ */
+static int pingsrv_start(void)
+{
+        int pid;
+
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+
+        pid = kernel_thread (pingsrv_thread,NULL,0);
+        if (pid < 0) {
+                CERROR ("Unable to start the pingsrv thread: %d\n", pid);
+                pingsrv_shutdown (1);
+                return pid;
+        }
+
+        return 0;
+} /* pingsrv_start() */
+
+
+
+/*
+ * Module load hook: allocate and zero the global server structure, then
+ * start the server.  Returns -ENOMEM if the structure cannot be
+ * allocated, otherwise the result of pingsrv_start().
+ */
+static int __init pingsrv_init(void)
+{
+        ping_head_magic = PING_HEADER_MAGIC;
+        ping_bulk_magic = PING_BULK_MAGIC;
+
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        /* pingsrv_shutdown() tests server->in_buf against NULL, so the
+         * structure has to start out zeroed. */
+        memset (server, 0, sizeof(struct pingsrv_data));
+
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+/*
+ * Module unload hook: ask the server thread to stop and wait until it
+ * has done so.
+ *
+ * The thread is asked to stop by clearing 'running'; it sets 'running'
+ * back to 1 after it has released the server's resources, which is what
+ * the polling loop below waits for.
+ */
+static void __exit pingsrv_cleanup(void)
+{
+        /* NOTE(review): nothing in this file creates this proc entry */
+        remove_proc_entry ("net/pingsrv", NULL);
+
+        /* Without a server structure there is no thread to stop */
+        if (server == NULL)
+                return;
+
+        running = 0;
+        /* tsk is filled in by the thread itself, so it is still NULL if
+         * the thread has not run yet; the thread then sees 'running'
+         * cleared as soon as it does. */
+        if (server->tsk != NULL)
+                wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+
+} /* pingsrv_cleanup() */
+
+
+/* 'nal' is an integer module parameter that selects which NAL
+ * pingsrv_setup() asks kportal_get_ni() for. */
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+/* Load/unload entry points */
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c
new file mode 100644 (file)
index 0000000..4cef08b
--- /dev/null
@@ -0,0 +1,276 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* This is a stripped-down version of the pinger. It follows a simple
+ * request-response protocol: it does not do bulk data pinging, and it
+ * does not send multiple packets in a single ioctl.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+/* Size in bytes of the ping message buffers allocated in this file. */
+#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes
+                                                   assumed */
+
+/* This should be enclosed in a structure */
+
+/* The single in-flight ping request; allocated by kping_client() and
+ * freed by pingcli_shutdown(). */
+static struct pingcli_data *client = NULL;
+
+/* Only ever reset to 0 (in pingcli_start()); never read in this file. */
+static int count = 0;
+
+/*
+ * Release the client's Portals resources, its buffers and the client
+ * structure.
+ *
+ * 'err' says how far setup had progressed.  Each case deliberately falls
+ * through to the next, so everything acquired before that point is
+ * released as well:
+ *   1 - unlink the outgoing memory descriptor
+ *   2 - free the event queue and unlink the match entry
+ *   3 - drop the NAL reference taken with kportal_get_ni()
+ *   4 - free the in/out buffers (if allocated) and the client structure
+ * Any other value releases nothing.  Does nothing if there is no client.
+ */
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Every case below dereferences 'client', so without one there
+         * is nothing that can safely be released. */
+        if (client == NULL)
+                return;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+                        /* fall through */
+                case 4:
+                        /* Free our buffers */
+                        if (client->outbuf != NULL)
+                                PORTAL_FREE (client->outbuf, STDSIZE);
+
+                        if (client->inbuf != NULL)
+                                PORTAL_FREE (client->inbuf, STDSIZE);
+
+                        PORTAL_FREE (client, sizeof(struct pingcli_data));
+
+                        /* Do not leave the file-scope pointer dangling */
+                        client = NULL;
+                        break;
+
+                default:
+                        break;
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+/* Event queue callback for the client's incoming buffer: any event wakes
+ * the task sleeping in pingcli_start().  'ev' is not inspected.  Always
+ * returns 1. */
+static int pingcli_callback(ptl_event_t *ev)
+{
+        wake_up_process (client->tsk);
+
+        return 1;
+}
+
+
+/*
+ * Run one ping request described by 'args': set up the Portals objects,
+ * send a single STDSIZE-byte ping to args->ioc_nid and wait for either a
+ * reply or a timeout, logging which of the two happened.
+ *
+ * The global 'client' must already be allocated and zeroed.  All
+ * resources, including 'client' itself, are released through
+ * pingcli_shutdown() before returning on every path, so the function
+ * always returns NULL.  The level passed to pingcli_shutdown() on each
+ * failure path names the last stage that had been completed.
+ */
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        const ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        int rc;
+
+        client->tsk = current;
+        client->args = args;
+
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64",  \
+                        nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE) ;
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf,  STDSIZE);
+
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded.\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID.
+         * Only the NAL reference is held so far, hence level 3. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (3);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (3);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface.  There is
+         * no cleanup level for "match entry only", so level 2 is used to
+         * get the match entry unlinked. */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = STDSIZE;
+        client->md_in_head.threshold = 1;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, STDSIZE);
+
+        /* Attach the incoming buffer.  The outgoing descriptor does not
+         * exist yet, hence level 2 rather than 1. */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE;
+        client->md_out_head.threshold = 1;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header.  If this fails there is no
+         * outgoing descriptor to unlink, hence level 2 rather than 1. */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+        /* Put the ping packet */
+        if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                         client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                PDEBUG ("PtlPut (header)", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+
+        count = 0;
+        /* NOTE(review): a reply that arrives before the task state is
+         * set here is not noticed until the timeout expires. */
+        set_current_state (TASK_INTERRUPTIBLE);
+        rc = schedule_timeout (20 * args->ioc_timeout);
+        if (rc == 0) {
+                printk (" Time out on the server\n");
+                pingcli_shutdown (2);
+                return NULL;
+        } else
+                printk("Received respose from the server \n");
+
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/*
+ * Entry point called by the portals ioctl code for ping requests.
+ * Allocates and zeroes the global client structure, then runs the ping
+ * described by 'args'.  Returns 0 in every case, including when the
+ * client structure cannot be allocated.
+ */
+static int kping_client(struct portal_ioctl_data *args)
+{
+
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        /* The allocation has to be checked before the structure is
+         * touched. */
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+/* Module load hook: register kping_client through
+ * PORTAL_SYMBOL_REGISTER.  Always returns 0. */
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+/* Module unload hook: undo the registration made in pingcli_init(). */
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+/* Module metadata and the load/unload entry points. */
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+/* kping_client is exported explicitly only when building for kernels
+ * 2.5.0 and later. */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c
new file mode 100644 (file)
index 0000000..a18ea35
--- /dev/null
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is a stripped-down version of pinger. It follows a single
+ * request-response protocol. It doesn't do bulk data pinging, and it
+ * doesn't send multiple packets in a single ioctl.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+/* Size in bytes of the single ping buffer (server->in_buf); also used as
+ * the length of both memory descriptors built over it. */
+#define STDSIZE (sizeof(int) + sizeof(int) + 4)
+
+static int nal  = 0;                            /* NAL to use (module parameter) */
+static unsigned long packets_valid = 0;         /* pings seen by pingsrv_callback() */
+/* Cleared by pingsrv_cleanup() to stop pingsrv_thread(); the thread sets
+ * it back to 1 once it has released its resources. */
+static int running = 1;
+/* Count of received pings not yet answered: incremented in
+ * pingsrv_packet(), decremented in pingsrv_thread(). */
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             /* Our ping server state */
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#endif
+
+/* Release ping server resources, starting at the stage given by 'err'.
+ *
+ * err selects the first cleanup stage; every later stage runs as well:
+ *   1 - unlink the incoming memory descriptor
+ *   2 - free the event queue and unlink the match entry
+ *   3 - drop the NI reference taken with kportal_get_ni()
+ *   4 - free the ping buffer and the server structure
+ * Any other value releases nothing.
+ *
+ * Always returns NULL so that callers returning a pointer can write
+ * "return pingsrv_shutdown (n);".
+ */
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally allowing us to fall through each
+         * case in to the next.  This allows us to pass an error
+         * code to just clean up the right stuff.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (nal);
+                        /* fall through */
+                case 4:
+                        /* Test 'server' before looking inside it: the
+                         * buffer pointer lives in the structure. */
+                        if (server != NULL) {
+                                if (server->in_buf != NULL)
+                                        PORTAL_FREE (server->in_buf, STDSIZE);
+
+                                PORTAL_FREE (server,
+                                             sizeof (struct pingsrv_data));
+                        }
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping sever resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+/* Body of the "pingsrv" kernel thread.
+ *
+ * Loops while 'running' is non-zero.  Whenever 'pkt' is non-zero it binds
+ * an outgoing memory descriptor over server->in_buf, re-attaches the
+ * incoming descriptor to the match entry, sends the buffer back to the
+ * initiator recorded in server->evnt, and decrements 'pkt'.
+ *
+ * arg: unused.
+ *
+ * Returns 0 after a normal stop, 1 if PtlMDBind fails.
+ *
+ * NOTE(review): the PtlMDBind failure path releases everything and
+ * returns without setting 'running' back to 1, while pingsrv_cleanup()
+ * waits for running == 1 and then touches server->tsk - confirm that
+ * module unload cannot hang or use freed memory after this path.
+ */
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        
+        kportal_daemonize ("pingsrv");
+        /* Recorded so pingsrv_packet() and pingsrv_cleanup() can wake us */
+        server->tsk = current;
+        
+        while (running) {
+                /* NOTE(review): the task state is left at
+                 * TASK_INTERRUPTIBLE when a packet is pending and the
+                 * code falls through to the Ptl* calls - confirm. */
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                               
+                /* The reply is sent straight out of the receive buffer */
+                server->mdout.start     = server->in_buf;
+                server->mdout.length    = STDSIZE;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout, 
+                                                &server->mdout_h))) {
+                         PDEBUG ("PtlMDBind", rc);
+                         pingsrv_shutdown (1);
+                         return 1;
+               }
+         
+                
+                /* Re-arm the incoming descriptor (threshold 1, PTL_UNLINK)
+                 * for the next ping */
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = STDSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                /* NOTE(review): "resources allocated" is logged on the
+                 * failure branch, and the failure is otherwise ignored -
+                 * confirm intent. */
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                        CDEBUG (D_OTHER, "ping server resources allocated\n");
+                }
+                
+                /* Reply to whoever sent the last event we recorded */
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        pingsrv_shutdown (1);
+        /* Tell pingsrv_cleanup() that teardown has finished */
+        running = 1;
+        return 0;    
+}
+
+/* Note one more pending ping and wake the server thread to answer it.
+ *
+ * ev: the received event; not examined here.
+ *
+ * Always returns 1.
+ */
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        /* Count first, then wake: the thread only works when pkt != 0 */
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
+/* Event queue callback (installed by pingsrv_setup() via PtlEQAlloc).
+ *
+ * Saves a copy of the event in server->evnt, logs the sender and the
+ * first int of the received data, bumps packets_valid and hands the
+ * event to pingsrv_packet().
+ *
+ * ev: the event delivered by portals.
+ *
+ * Returns 0 if ev is NULL, otherwise the result of pingsrv_packet().
+ */
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        /* Kept so the server thread knows whom to reply to */
+        server->evnt = *ev;
+        
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *((int *)(ev->mem_desc.start + ev->offset)));
+        
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+        
+} /* pingsrv_callback() */
+
+
+/* Acquire every portals resource the ping server needs: the NI for 'nal',
+ * our portal id, a match entry on PTL_PING_SERVER, an event queue whose
+ * callback is pingsrv_callback(), the STDSIZE ping buffer, and the
+ * incoming memory descriptor attached to the match entry.
+ *
+ * Expects 'server' to have been allocated already.
+ *
+ * Returns 'server' on success.  On any failure everything acquired so
+ * far is released through pingsrv_shutdown() (which also frees 'server')
+ * and NULL is returned.
+ */
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                /* Only the NI reference is held at this point; stage 2
+                 * would also free an EQ and ME that do not exist yet. */
+                return pingsrv_shutdown (3);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                /* Still nothing but the NI reference to give back */
+                return pingsrv_shutdown (3);
+        }
+
+
+        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                /* The ME exists but the EQ does not, and no shutdown
+                 * stage matches that: unlink the ME here, then let
+                 * stage 3 release the rest. */
+                if ((rc = PtlMEUnlink (server->me)))
+                        PDEBUG ("PtlMEUnlink", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        PORTAL_ALLOC (server->in_buf, STDSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = STDSIZE;
+        server->mdin.threshold = 1;
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                /* Without the incoming descriptor no ping can ever be
+                 * received, so report failure instead of success. */
+                return pingsrv_shutdown (2);
+        }
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+
+        /* Success! */
+        return server;
+} /* pingsrv_setup() */
+
+/* Set up the server resources and start the pingsrv kernel thread.
+ *
+ * Returns 0 on success, -ENOMEM if pingsrv_setup() fails (whatever the
+ * underlying cause), or the negative value returned by kernel_thread()
+ * if the thread cannot be created.
+ */
+static int pingsrv_start(void)
+{
+        int pid;
+
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+
+        pid = kernel_thread (pingsrv_thread,NULL,0);
+        if (pid < 0) {
+                /* No thread will ever release what setup acquired, so
+                 * do it here before reporting the failure. */
+                CERROR ("unable to start pingsrv thread: %d\n", pid);
+                pingsrv_shutdown (1);
+                return pid;
+        }
+        return 0;
+} /* pingsrv_start() */
+
+
+
+/* Module entry point: allocate the server structure and start the
+ * server.
+ *
+ * Returns -ENOMEM if the structure cannot be allocated, otherwise the
+ * result of pingsrv_start().
+ */
+static int __init pingsrv_init(void)
+{
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                /* pingsrv_setup() dereferences 'server' unconditionally */
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+/* Module exit point: ask the server thread to stop and wait until it has
+ * released its resources.
+ *
+ * The handshake uses 'running': it is cleared here, and the thread sets
+ * it back to 1 after calling pingsrv_shutdown().
+ */
+static void __exit pingsrv_cleanup(void)
+{
+        /* NOTE(review): nothing visible in this file creates a
+         * "net/pingsrv" proc entry - confirm this removal is needed. */
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        /* Poll once per second (HZ ticks) until the thread signals that
+         * it is done */
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh
new file mode 100644 (file)
index 0000000..c9b7c16
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+#
+# Load the portals core, the NAL named by $1 (tcp, toe or elan) and the
+# ping client module.  SIMPLE=0 (the default) loads pingcli.o; any other
+# value loads spingcli.o.  The name of the loaded NAL module is written
+# to /tmp/nal.
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli.o
+else
+       PING=spingcli.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING 
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+exit 0;
diff --git a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh
new file mode 100644 (file)
index 0000000..942300e
--- /dev/null
@@ -0,0 +1,38 @@
+#!/bin/sh
+#
+# Load the portals core, the NAL named by $1 (tcp, toe or elan) and the
+# ping server module, then start the acceptor on port 9999.  SIMPLE=0
+# (the default) loads pingsrv.o; any other value loads spingsrv.o.  The
+# name of the loaded NAL module is written to /tmp/nal.
+#
+# The nal= values follow the ping server's module parameter description:
+# 4-toenal, 2-ksocknal, 1-kqswnal.
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv.o
+else
+       PING=spingsrv.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING nal=4
+               echo ktoenal > /tmp/nal
+       ;;
+
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING nal=2
+               echo ksocknal > /tmp/nal
+       ;;
+
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               # kqswnal is NAL 1; 4 would select toenal, which this
+               # branch does not load.
+               /sbin/insmod ./$PING nal=1
+               echo kqswnal > /tmp/nal
+       ;;
+
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lnet/tests/stopclient.sh b/lnet/tests/stopclient.sh
new file mode 100644 (file)
index 0000000..f7e3aa1
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# Unload the ping client, the NAL recorded in /tmp/nal and the portals
+# core.  SIMPLE selects the client module exactly as startclient.sh
+# does: 0 (the default) means pingcli, anything else means spingcli.
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli
+else
+       PING=spingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lnet/tests/stopserver.sh b/lnet/tests/stopserver.sh
new file mode 100644 (file)
index 0000000..3e81831
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+# Unload the ping server, the NAL recorded in /tmp/nal and the portals
+# core, and stop the acceptor.  SIMPLE selects the server module exactly
+# as startserver.sh does: 0 (the default) means pingsrv, anything else
+# means spingsrv.
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv
+else
+       PING=spingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
new file mode 100644 (file)
index 0000000..b62b401
--- /dev/null
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lnet/ulnds/README b/lnet/ulnds/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However this means that addresses
+exchanged in messages between hosts of different orderings will not
+function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be significantly
+improved by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+*/
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node: its IPv4 address in host byte
+ *          order, or 0 if no address could be determined
+ *
+ * If PTL_HOSTID is set it is used directly, either as dotted decimal
+ *  (n.n.n.n) or as a number whose second character is 'x' (e.g.
+ *  0x7f000001).  A PTL_HOSTID that is not four fields of 0-255 yields 0.
+ *
+ * Otherwise get_node_id() determines the host name and uses the resolver
+ *  to find out its ip address. This is fairly fragile and inflexible, but
+ *  explicitly asking about interfaces and their addresses is very
+ *  complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    unsigned int x;
+    struct hostent *he;
+    char * host_envp;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            /* don't resolve an uninitialised buffer */
+            if (gethostname(buffer,sizeof(buffer)) != 0)
+                    return(0);
+            /* gethostname need not terminate a truncated name */
+            buffer[sizeof(buffer)-1] = '\0';
+            he=gethostbyname(buffer);
+            if (he && he->h_addr_list[0])
+                    x=*(unsigned int *)he->h_addr_list[0];
+            else
+                    x = 0;
+            return(ntohl(x));
+        }
+    else 
+        {
+            /* look at [1] only if [0] is not the terminator */
+            if (host_envp[0] == '\0' || host_envp[1] != 'x')
+                {
+                    /* unsigned so that a first octet >= 128 can be
+                     * shifted into the top byte */
+                    unsigned int a, b, c, d;
+                    if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+                            return(0);
+                    if (a > 255 || b > 255 || c > 255 || d > 255)
+                            return(0);
+                    return ((a<<24) | (b<<16) | (c<<8) | d);
+                }
+            else
+                {
+                    long long hostid = strtoll(host_envp, 0, 0);
+                    return((unsigned int) hostid);
+                }
+        }
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *            pidrequest: the portals pid requested by the caller
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the procnal structures.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_PID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+/* DIRECT_IP_MODE variant: the nid is the node id from get_node_id() and
+ * the pid is the requested pid, with PTL_PID_ANY mapped to 0. */
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port;
+    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
+    else port=pidrequest;
+    t->nal_cb->ni.nid=get_node_id();
+    t->nal_cb->ni.pid=port;
+}
+#else
+
+/* Virtual-node variant: builds the nid from the masked node id and the
+ * virtual node number taken from PTL_VIRTNODE (0 if unset), records the
+ * shifted-down node id in t->iptop8, and stores the requested pid.
+ *
+ * On an out-of-range PTL_VIRTNODE the function prints a message and
+ * returns before writing anything to t.  On an unsupported or
+ * out-of-range pid it prints a message and returns after iptop8 and nid
+ * have been set but without setting the pid.  The caller gets no error
+ * indication in either case.
+ */
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode, in_addr, port; 
+    ptl_pid_t pid;
+
+    /* get and remember my node id*/
+    if (!getenv("PTL_VIRTNODE"))
+        virtnode = 0;
+    else 
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT 
+                                              >> PNAL_VNODE_SHIFT);
+            /* NOTE(review): atoi gives no error indication, and only
+             * the upper bound is checked - confirm negative values
+             * cannot occur. */
+            virtnode = atoi(getenv("PTL_VIRTNODE"));
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+    
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+#ifdef notyet
+    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
+#endif
+    if (pid==(unsigned short)PTL_PID_ANY) 
+        {
+            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
+            return;
+        }
+    else if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    /* NOTE(review): 'port' is computed here but never stored or used
+     * afterwards in this function. */
+    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <portals/lib-p30.h>
+
+/* Per-interface state shared between the generic code and a NAL.
+ * Note that 'bridge' is a pointer typedef. */
+typedef struct bridge {
+    int alive;
+    nal_cb_t *nal_cb;                   /* set_address() fills in ni.nid and ni.pid */
+    void *lower;                        /* presumably NAL-private state - TODO confirm */
+    void *local;
+    void (*shutdown)(struct bridge *);
+    /* this doesn't really belong here */
+    unsigned char iptop8;               /* high bits of the node id, set by set_address() */
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c
new file mode 100644 (file)
index 0000000..89c9f78
--- /dev/null
@@ -0,0 +1,293 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: arg1: a connection stored in the hash table
+ *            arg2: the key to test it against, an unsigned int[2]
+ *                  holding { ip, port }
+ * Returns: 1 if the connection has that ip and port, 0 otherwise
+ *
+ *    compare_connection() is the equality test handed to
+ *    hash_create_table(); it resolves collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection conn = arg1;
+        unsigned int *key = arg2;
+
+        if (conn->ip != key[0])
+                return(0);
+        return(conn->port == key[1]);
+}
+
+
+/* Function:  connection_key
+ * Arguments: id: the key to hash, an unsigned int[2] holding { ip, port }
+ * Returns: a not-particularly-well-distributed hash of the key
+ *          (the two words xor-ed together)
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    unsigned int ip = id[0];
+    unsigned int port = id[1];
+
+    return(ip ^ port);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: arg: the connection to remove
+ *
+ * Takes the connection out of its manager's hash table, closes its
+ * socket and frees it.  The connection must not be used afterwards.
+ */
+void remove_connection(void *arg)
+{
+        connection c = arg;
+        /* same { ip, port } key the connection was inserted under */
+        unsigned int id[2] = { c->ip, c->port };
+
+        hash_table_remove(c->m->connections,id);
+        close(c->fd);
+        free(c);
+}
+
+
+/* Function:  read_connection: 
+ * Arguments: c:    the connection to read from 
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read   
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors. A read interrupted by a signal (EINTR) is retried.
+ *   End of file and any other error remove the connection (which
+ *   frees c) and return 0. A len of zero or less reads nothing and
+ *   returns 1.
+ *   TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    while (offset<len) {
+        rc=syscall(SYS_read, c->fd, dest+offset, len-offset);
+        if (rc>0) {
+            offset+=rc;
+            continue;
+        }
+        /* errno is only meaningful after a failure (rc<0); at end of
+         * file (rc==0) it may still hold a stale EINTR */
+        if (rc<0 && errno==EINTR)
+            continue;
+        remove_connection(c);
+        return(0);
+    }
+    return(1);
+}
+
+/* Read handler registered for every connection socket: forwards to the
+ * manager's input handler, passing the manager's handler argument and
+ * the connection, and returns whatever that handler returns. */
+static int connection_input(connection c)
+{
+    manager m = c->m;
+
+    return(m->handler(m->handler_arg, c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:    the manager the connection belongs to
+ *            ip:   ip address of the peer
+ *            port: port of the peer
+ *            fd:   open file descriptor for the socket
+ * Returns: an allocated connection structure, or 0 if memory could
+ *          not be allocated (fd is closed in that case)
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+
+    if (!c) {
+        /* nobody else owns fd yet, so don't leak it */
+        close(fd);
+        return(0);
+    }
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request; it accepts and installs a
+ *     new connection keyed by the peer's ip address and port 0.
+ *     A failed accept is reported and otherwise ignored.
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    socklen_t len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid;
+
+    if (fd<0) {
+        /* nothing to install; stay registered for the next request */
+        perror("tcpnal accept");
+        return(1);
+    }
+    nid=s.sin_addr.s_addr;
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,ntohl(nid),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: m:    the connection manager
+ *            ip:   ip address of the peer, in host byte order
+ *            port: ignored; tcpnal_acceptor_port is always used
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection;
+ *          0 if the connection could not be established
+ *
+ * Exits the process if no socket can be created.
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    if (!(c=hash_table_find(m->connections,id))){
+        int fd;
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family      = AF_INET;
+        addr.sin_addr.s_addr = htonl(ip);
+        addr.sin_port        = htons(port);
+
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
+            perror("tcpnal socket failed");
+            exit(-1);
+        }
+        if (connect(fd,
+                    (struct sockaddr *)&addr,
+                    sizeof(struct sockaddr_in)))
+            {
+                /* report first: close() may overwrite errno */
+                perror("tcpnal connect");
+                close(fd);
+                return(0);
+            }
+        return(allocate_connection(m,ip,port,fd));
+    }
+    return(c);
+}
+
+
+/* Function:  bind_socket
+ * Arguments: m: the connection manager for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound, bound_handler and port elements of m. On error
+ *  no socket is left open.
+ *
+ *  TODO: The port should be an explicitly sized type.
+ *  NOTE(review): port is stored into sin_port as given and m->port is
+ *  read back from sin_port, while force_tcp_connection() applies
+ *  htons() - confirm which byte order callers pass and expect.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    socklen_t alen=sizeof(struct sockaddr_in);
+    
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
+        return(0);
+    
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    addr.sin_port        = port; 
+    
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind"); 
+        close(m->bound);
+        return(0);
+    }
+    
+    /* learn the port actually assigned (matters when port was 0) */
+    getsockname(m->bound,(struct sockaddr *)&addr, &alen);
+
+    /* start listening before asking to be told about new connections */
+    if (listen(m->bound,5)<0){
+        perror ("tcpnal listen");
+        close(m->bound);
+        return(0);
+    }
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    m->port=addr.sin_port;
+    return(1);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources: closes the listening
+ * socket, removes its io handler, destroys the connection table with
+ * remove_connection as the per-entry callback, and frees m.
+ *
+ * NOTE(review): remove_connection() itself calls hash_table_remove() on
+ * the table being destroyed - confirm hash_destroy_table() tolerates
+ * that.
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: pid:   the port to attempt to bind to
+ *            input: handler called as input(a, connection) when a
+ *                   connection becomes readable
+ *            a:     opaque first argument for input
+ * Returns: a newly allocated manager structure, or
+ *          zero if memory could not be allocated or the
+ *          port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(),
+                         void *a)
+{
+    manager m=(manager)malloc(sizeof(struct manager));
+
+    if (!m) return(0);
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+    /* the table is still empty here; release it along with m */
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+    return(0);
+}
diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h
new file mode 100644 (file)
index 0000000..f6b2994
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <table.h>
+
+/* Connection manager: one listening socket plus the table of open
+ * connections.  Note that 'manager' is a pointer typedef. */
+typedef struct manager {
+    table connections;                  /* open connections, keyed by { ip, port } */
+    int bound;                          /* listening socket */
+    io_handler bound_handler;           /* read handler registered for 'bound' */
+    int (*handler)(void *, void *);     /* called as handler(handler_arg, connection) */
+    void *handler_arg;
+    unsigned short port;                /* sin_port of 'bound' as reported by getsockname() */
+} *manager;
+
+
+/* One TCP connection to a peer.  Note that 'connection' is a pointer
+ * typedef. */
+typedef struct connection {
+    unsigned int ip;                    /* peer ip; with 'port', the hash table key */
+    unsigned short port;                /* peer port; 0 for accepted connections */
+    int fd;                             /* the connected socket */
+    manager m;                          /* owning manager */
+} *connection;
+
+connection force_tcp_connection(manager m,
+                                unsigned int ip,  
+                               unsigned int short);
+manager init_connections(unsigned short,
+                         int (*f)(void *,connection),
+                         void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len);
diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+/* printed as the "cpu" field of every record by portals_debug_msg() */
+int smp_processor_id = 1;
+/* NOTE(review): not referenced by any function in this file */
+char debug_file_path[1024] = "/tmp/lustre-log";
+/* printed by the dumplog functions; nothing in this file writes to it */
+char debug_file_name[1024];
+/* output stream for debug records; set to stdout by portals_debug_init() */
+FILE *debug_file_fd;
+
+/* Print the name of the debug file on stdout; `arg` is ignored.
+ * Always returns 0. */
+int portals_do_debug_dumplog(void *arg)
+{
+        (void)arg;
+        fprintf(stdout, "Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+/* No-op in this user-space build. */
+void portals_debug_print(void)
+{
+}
+
+
+/* Print the name of the debug file on stdout. */
+void portals_debug_dumplog(void)
+{
+        fprintf(stdout, "Look in %s\n", debug_file_name);
+}
+
+
+/* Route debug output to stdout; `bufsize` is ignored. Always returns 0. */
+int portals_debug_init(unsigned long bufsize)
+{
+        (void)bufsize;
+        debug_file_fd = stdout;
+        return 0;
+}
+
+/* Nothing to release: the debug stream is stdout and is left open.
+ * Always returns 0. */
+int portals_debug_cleanup(void)
+{
+        int status = 0;
+
+        return status;
+}
+
+/* There is no buffer in this build, so there is nothing to clear.
+ * Always returns 0. */
+int portals_debug_clear_buffer(void)
+{
+        int status = 0;
+
+        return status;
+}
+
+/* Write a three-line marker (rule, "DEBUG MARKER: <text>", rule) to the
+ * debug stream. Always returns 0. */
+int portals_debug_mark_buffer(char *text)
+{
+        static const char rule[] =
+                "*******************************************************************************\n";
+
+        fputs(rule, debug_file_fd);
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fputs(rule, debug_file_fd);
+        return 0;
+}
+
+/* Stub: copies nothing; both arguments are ignored. Always returns 0. */
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        (void)buf;
+        (void)len;
+        return 0;
+}
+
+/* Write one debug record to the debug stream.
+ *
+ * The record is "<subsys>:<mask>:<cpu>:<sec>.<usec> " followed by
+ * "(<file>:<line>:<fn>() 0+<stack estimate>): " and then the caller's
+ * printf-style message built from `format` and the variadic arguments.
+ *
+ * Falls back to stdout when portals_debug_init() has not yet set
+ * debug_file_fd, so an early call does not dereference a NULL stream.
+ */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;   /* never read: only its address is used below */
+        struct timeval tv;
+        FILE *out = debug_file_fd ? debug_file_fd : stdout;
+
+        gettimeofday(&tv, NULL);
+
+        /* casts make the arguments match the conversion specifiers exactly;
+         * tv_sec/tv_usec are not guaranteed to be unsigned long */
+        fprintf(out,
+                "%02x:%06x:%d:%lu.%06lu ",
+                (unsigned int)(subsys >> 24), (unsigned int)mask,
+                smp_processor_id,
+                (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        /* the address of a local is used as a rough stack-position figure */
+        fprintf(out,
+                "(%s:%d:%s() %d+%ld): ",
+                file, line, fn, 0,
+                (long)(8192 - ((unsigned long)&flags & 8191UL)));
+
+        /* a va_list must go through vfprintf; handing it to fprintf is
+         * undefined behaviour and printed garbage */
+        va_start (ap, format);
+        vfprintf(out, format, ap);
+        va_end (ap);
+}
+
diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+/* io_handler is a pointer typedef: variables of this type are pointers
+ * to struct io_handler. */
+typedef struct io_handler *io_handler;
+
+/* One registered file-descriptor callback. */
+struct io_handler{
+  /* presumably the address of the link that points at this node, so the
+     node can be unlinked without walking the list -- TODO confirm */
+  io_handler *last;
+  /* next handler in the list */
+  io_handler next;
+  /* descriptor being watched */
+  int fd;
+  /* mask of READ_HANDLER / WRITE_HANDLER / EXCEPTION_HANDLER */
+  int type;
+  /* callback and the argument it is invoked with */
+  int (*function)(void *);
+  void *argument;
+  /* presumably non-zero when the handler should be skipped -- TODO confirm */
+  int disabled;
+};
+
+
+/* Handler type bits; may be OR-ed together (ALL_HANDLER is all three). */
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+/* Register `function(arg)` for events of `type` on `fd`. */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+/* The `when` type is not declared in this header; the includer must have
+ * pulled in pqtimer.h first. */
+void select_timer_block(when until);
+when now(void);
diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* DIRECT_IP_MODE is defined unconditionally just below, so only the first
+ * branch is ever compiled: the nid is the IP address and the pid is the
+ * port, with no translation. */
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+/* Packed mode (currently compiled out): nid and pid are derived from the
+ * low bits of the IP address and an offset from PNAL_BASE_PORT. */
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+/* NOTE(review): the second argument `t` is dereferenced as t->iptop8 and
+ * is not parenthesised in the expansion. */
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lnet/ulnds/pqtimer.c b/lnet/ulnds/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* pqtimer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* One pending timer; heap entries in `timers` point at these. */
+struct timer {
+  /* callback and its argument, invoked by timer_fire() unless disabled */
+  void (*function)(void *);
+  void *arg;
+  /* absolute expiry time: now()+interval at registration */
+  when w;
+  /* NOTE(review): register_timer() stores a `when` (unsigned long long)
+     here, so large intervals are truncated; the field is not read in
+     this file */
+  int interval;
+  /* set by remove_timer(); a disabled timer is freed without being called */
+  int disable;
+};
+
+/* Singly linked list node for a background callback run by timer_loop(). */
+typedef struct thunk *thunk;
+struct thunk {
+    /* callback and the argument it is called with */
+    void (*f)(void *);
+    void *a;
+    /* next thunk; new thunks are pushed at the head by register_thunk() */
+    thunk next;
+};
+
+extern when now(void);
+
+/* head of the thunk list */
+static thunk thunks;
+/* NOTE(review): read by register_timer() but never assigned in this file */
+static int internal;
+/* blocking function installed by initialize_timer() */
+static void (*block_function)(when);
+/* number of live heap entries; entries occupy timers[1..number_of_timers] */
+static int number_of_timers;
+/* allocated capacity of `timers`, in entries */
+static int size_of_pqueue;
+/* 1-based binary min-heap ordered by expiry time; slot 0 is unused */
+static timer *timers;
+
+
+/* Sift the entry at index `where` down the 1-based min-heap until neither
+ * child expires earlier than it does. */
+static void heal(int where)
+{
+    for (;;) {
+        int l=where*2;
+        int r=l+1;
+        int smallest=where;
+        timer held;
+
+        if ((l <= number_of_timers) &&
+            (timers[l]->w < timers[smallest]->w))
+            smallest=l;
+        if ((r <= number_of_timers) &&
+            (timers[r]->w < timers[smallest]->w))
+            smallest=r;
+
+        /* heap property holds here, nothing more to do */
+        if (smallest == where) return;
+
+        held=timers[where];
+        timers[where]=timers[smallest];
+        timers[smallest]=held;
+        where=smallest;
+    }
+}
+
+/* Sift the entry at index `i` up the 1-based min-heap while it expires
+ * earlier than its parent. */
+static void add_pqueue(int i)
+{
+    while (i > 1) {
+        int up=i/2;
+        timer held;
+
+        if (!(timers[i]->w < timers[up]->w)) break;
+
+        held=timers[i];
+        timers[i]=timers[up];
+        timers[up]=held;
+        i=up;
+    }
+}
+
+/* Append `t` to the heap, growing the array by ten slots when it is full,
+ * then restore heap order.
+ *
+ * Growth uses realloc so the previous array is released; the old code
+ * allocated a fresh array, copied into it, and leaked the old one on every
+ * grow. The element size is sizeof(timer) (a pointer), not
+ * sizeof(struct timer). Running out of memory aborts, since this function
+ * has no way to report failure and the old code crashed on it anyway.
+ */
+static void add_timer(timer t)
+{
+    if (size_of_pqueue<(number_of_timers+2)){
+       int newsize=size_of_pqueue+10;
+       timer *grown=(timer *)realloc(timers,sizeof(timer)*newsize);
+
+       if (grown==NULL) abort();
+       timers=grown;
+       size_of_pqueue=newsize;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure, or zero if the timer
+ *          could not be allocated
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    /* allocation can fail; report it instead of writing through null */
+    if (t==NULL) return(0);
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    /* first timer registered from outside: hand its expiry to the
+       blocking function */
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to cancel
+ * Returns: nothing
+ *
+ * remove_timer marks a timer as disabled, ensuring that its
+ * callback will never be invoked. The timer is not freed here
+ * because of reentrancy issues; timer_fire() frees it when it
+ * reaches the head of the queue.
+ */
+
+void remove_timer(timer t)
+{
+    const int cancelled=1;
+
+    t->disable=cancelled;
+}
+
+
+
+/* Pop the earliest timer off the heap, invoke its callback unless it was
+ * disabled by remove_timer(), and free it.
+ *
+ * Does nothing when the heap is empty. Without that guard the function
+ * read the unused timers[1] slot, drove number_of_timers negative and
+ * freed a garbage pointer.
+ */
+void timer_fire()
+{
+    timer current;
+
+    if (number_of_timers < 1) return;
+
+    current=timers[1];
+    /* move the last entry to the root and sift it down */
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+/* Fire every timer that has expired as of the time of the call, then
+ * return the expiry time of the earliest remaining timer, or 0 when no
+ * timers remain. */
+when next_timer(void)
+{
+    when here=now();
+
+    for (;;) {
+        if (!number_of_timers) return(0);
+        if (timers[1]->w > here) return(timers[1]->w);
+        timer_fire();
+    }
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ *
+ * timer_loop() is the blocking dispatch function for the timer.
+ * It calls the block function registered with initialize_timer,
+ * and fires timers that have been registered. Every pass runs
+ * all thunks first; when no timers are pending the thunks are
+ * run a second time before blocking with a zero deadline.
+ */
+void timer_loop()
+{
+    for (;;) {
+       thunk t;
+       when here=now();
+
+       for (t=thunks;t;t=t->next) (*t->f)(t->a);
+
+       if (!number_of_timers){
+           for (t=thunks;t;t=t->next) (*t->f)(t->a);
+           (*block_function)(0);
+           continue;
+       }
+
+       if (timers[1]->w > here)
+           (*block_function)(timers[1]->w);
+       else
+           timer_fire();
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals; they
+ * should not assume when, or take a particularly long
+ * amount of time. Thunks are for background cleanup tasks.
+ * The new thunk is pushed onto the head of the list.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk fresh=(void *)malloc(sizeof(struct thunk));
+
+    fresh->next=thunks;
+    fresh->a=a;
+    fresh->f=f;
+    thunks=fresh;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function to call to block until a given time
+ *
+ * initialize_timer() must be called before any other timer function,
+ * including timer_loop. It empties the thunk list and the heap and
+ * allocates room for ten heap slots.
+ */
+void initialize_timer(void (*block)(when))
+{
+    thunks=0;
+    number_of_timers=0;
+    size_of_pqueue=10;
+    timers=(timer *)malloc(size_of_pqueue*sizeof(timer));
+    block_function=block;
+}
diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* Time value: 64 bits, seconds in the upper 32 bits and a binary fraction
+ * of a second in the lower 32 (see now() in select.c). */
+typedef unsigned long long when;
+when now(void);
+/* Opaque timer handle; struct timer is private to pqtimer.c. */
+typedef struct timer *timer;
+/* Schedule function(argument) to run `interval` from now. */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+/* NOTE(review): no definition of register_timer_wait appears in pqtimer.c */
+timer register_timer_wait(void);
+/* Cancels the timer; it is marked disabled rather than freed. */
+void remove_timer(timer);
+/* Never returns. */
+void timer_loop(void);
+/* Must be called before any other timer function. */
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+/* One second expressed in `when` units (2^32). */
+#define HZ 0x100000000ull
+
+
diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* api.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: PTL_OK on success, PTL_SEGV if the request could not be
+ *          written or the reply could not be read in full
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ *
+ * forward_failure() performs one raw syscall and bails out of the
+ * calling function when the transfer is short; it relies on a local
+ * named `b` being in scope.
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    /* request framing: command id, both lengths, then the argument block */
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    /* retry only when the read was interrupted by a signal. The old
+       condition was "errno += EINTR": it modified errno and was true for
+       practically every value, so a short read or EOF spun forever */
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k<0) && (errno == EINTR));
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index (unused)
+ * Returns: 0
+ *
+ * sends the PTL_FINI codepoint to the lower side, waits for its
+ *   reply, closes both pipes and frees the procbridge state
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    procbridge p=(procbridge)((bridge)n->nal_data)->local;
+    int code=PTL_FINI;
+    int end;
+
+    /* one-word handshake with the library thread */
+    syscall(SYS_write, p->to_lib[1],&code,sizeof(code));
+    syscall(SYS_read, p->from_lib[0],&code,sizeof(code));
+
+    for (end=0;end<2;end++) syscall(SYS_close, p->to_lib[end]);
+    for (end=0;end<2;end++) syscall(SYS_close, p->from_lib[end]);
+
+    free(p);
+    return(0);
+}
+
+
+/* Function: validate
+ *    stub that accepts every region: all arguments are ignored
+ *    and the result is always 0
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    (void)nal;
+    (void)base;
+    (void)extent;
+    return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  n: pointer to my top side nal structure
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half.
+ *
+ *  NOTE(review): the wait has no predicate, so a broadcast sent
+ *   before the caller reaches pthread_cond_wait is lost and a
+ *   spurious wakeup returns early; callers presumably re-check
+ *   their own state -- confirm.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+
+    /* the mutex must be held around pthread_cond_wait */
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_wait(&p->cond,&p->mutex);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* lock/unlock entry points of the api vector: both do nothing here */
+static void procbridge_lock(nal_t * nal, unsigned long *flags)
+{
+}
+
+static void procbridge_unlock(nal_t * nal, unsigned long *flags)
+{
+}
+/* api_nal
+ *  the interface vector to allow the generic code to access
+ *  this nal. this is separate from the library side nal_cb.
+ *  nal_data is filled in by the initialization code.
+ *  TODO: should be dynamically allocated
+ *
+ *  uses standard C99 designated initializers rather than the
+ *  obsolete GNU "field:" spelling.
+ */
+static nal_t api_nal = {
+    .ni       = {0},
+    .nal_data = NULL,
+    .forward  = procbridge_forward,
+    .shutdown = procbridge_shutdown,
+    .validate = procbridge_validate,
+    .yield    = procbridge_yield,
+    .lock     = procbridge_lock,
+    .unlock   = procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ */
+/* unix_failure: perform one raw syscall and, when it does not transfer
+ * exactly `length` bytes, perror(text) and return NULL from the calling
+ * function. Still used by procbridge_interface() below. */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+/* bridge_init is compiled out; procbridge_interface() is the live entry
+ * point. NOTE(review): if this is ever revived, the final read uses
+ * sizeof(rc) where rc is an int pointer, so it would request the size of
+ * a pointer rather than of an int. */
+#if 0
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    unix_failure(read,p->from_lib[0], rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+ptl_nid_t tcpnal_mynid;
+
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#undef unix_failure
diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* State shared between the api side and the library-side thread. */
+typedef struct procbridge {
+    /* the library-side thread, running nal_thread() */
+    pthread_t t;
+    /* cond/mutex pair: the library side broadcasts on cond, the api side
+       waits on it in its yield function */
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    /* pipe carrying requests from the api side to the library side */
+    int to_lib[2];
+    /* pipe carrying replies from the library side to the api side */
+    int from_lib[2];
+} *procbridge;
+
+/* Entry point of the library-side thread; the argument is a bridge. */
+extern void *nal_thread(void *);
+
+
+/* Command codes placed just above the generic dispatch range.
+ * PTL_FINI asks the library thread to shut down. */
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)
+
+/* Ceilings applied by nal_thread() to the requested table sizes. */
+#define MAX_ACLS        1
+#define MAX_PTLS        128
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+/* Returns the api-side NAL, or NULL when initialization fails. */
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* lib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+/* cb_write callback: a plain memcpy of len bytes from src_addr to
+ * dst_addr. nal and private are ignored; always returns 0. */
+static int nal_write(nal_cb_t *nal,
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    (void)nal;
+    (void)private;
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+/* cb_read callback: a plain memcpy of len bytes from src_addr to
+ * dst_addr. nal and private are ignored; always returns 0. */
+static int nal_read(nal_cb_t * nal,
+                    void *private,
+                   void *dst_addr,
+                   user_ptr src_addr,
+                   size_t len)
+{
+    (void)nal;
+    (void)private;
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+/* cb_malloc callback: forwards to malloc; nal is ignored. */
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    (void)nal;
+    return malloc(len);
+}
+
+/* cb_free callback: forwards to free; nal and len are ignored. */
+static void nal_free(nal_cb_t *nal,
+                     void *buf,
+                     ptl_size_t len)
+{
+    (void)nal;
+    (void)len;
+    free(buf);
+}
+
+/* cb_printf callback: printf-style output to stdout; nal is ignored. */
+static void nal_printf(nal_cb_t *nal,
+                       const char *fmt,
+                       ...)
+{
+    va_list args;
+
+    (void)nal;
+    va_start(args, fmt);
+    vfprintf(stdout, fmt, args);
+    va_end(args);
+}
+
+
+/* cb_cli / cb_sti callbacks: both are empty in this implementation. */
+static void nal_cli(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+    (void)nal;
+    (void)flags;
+}
+
+
+static void nal_sti(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+    (void)nal;
+    (void)flags;
+}
+
+
+/* cb_dist callback: stub that returns 0 and leaves *dist untouched. */
+static int nal_dist(nal_cb_t *nal,
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    (void)nal;
+    (void)nid;
+    (void)dist;
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: arg: the bridge for this interface
+ * Returns: 1 to continue reading from the pipe, 0 after a failure
+ *
+ *   data_from_api() reads one request from the api side in response
+ *   to a select, dispatches it, and writes the result block back.
+ *   A PTL_FINI request shuts the library down and ends this thread.
+ *
+ *   The argument and result lengths arrive over the pipe. Small blocks
+ *   use the on-stack buffers; anything larger is heap allocated, so a
+ *   length above 256/128 bytes can no longer overrun the stack.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+    bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    char arg_stack[ 256 ];
+    char ret_stack[ 128 ];
+    char *arg_block=arg_stack;
+    char *ret_block=ret_stack;
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+    int keep_going=0;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+
+    /* fall back to the heap when a block does not fit on the stack */
+    if (arg_len > sizeof(arg_stack)) arg_block=malloc(arg_len);
+    if (ret_len > sizeof(ret_stack)) ret_block=malloc(ret_len);
+    if ((arg_block==NULL) || (ret_block==NULL)) goto fail;
+
+    if (syscall(SYS_read,fd,arg_block,arg_len)!=arg_len) goto fail;
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    if (syscall(SYS_write,p->from_lib[1],ret_block,ret_len)!=ret_len)
+        goto fail;
+
+    keep_going=1;
+    goto out;
+
+fail:
+    lib_fini(b->nal_cb);
+out:
+    if (arg_block!=arg_stack) free(arg_block);
+    if (ret_block!=ret_stack) free(ret_block);
+    return(keep_going);
+}
+#undef data_failure
+
+
+
+/* Thunk registered by nal_thread: broadcasts on the procbridge condition
+ * variable, with the mutex held, to wake any api-side waiter. */
+static void wakeup_topside(void *z)
+{
+    procbridge shared=((bridge)z)->local;
+    pthread_mutex_t *guard=&shared->mutex;
+
+    pthread_mutex_lock(guard);
+    pthread_cond_broadcast(&shared->cond);
+    pthread_mutex_unlock(guard);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: zero, and only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ *
+ *  It reads the initialization message (pid, limits, nal type) from
+ *  the api side, initializes the selected NAL and the generic library,
+ *  writes one status word back, and on success enters timer_loop().
+ *  Any failure before lib_init() is reported to the api side as -1, so
+ *  the api side is never left waiting and never sees uninitialized data.
+ *
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience
+ */
+#define LIMIT(x,y,max)\
+     if ((unsigned int)x > max) y = max;
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc=-1;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    if (b->nal_cb==NULL) {
+        perror("procbridge malloc");
+        goto report;
+    }
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    /* each read must deliver the whole object; the old "!rc" test only
+       caught end-of-file and let errors and short reads through */
+    if ((syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))
+             != sizeof(pid_request)) ||
+        (syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))
+             != sizeof(ptl_ni_limits_t)) ||
+        (syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))
+             != sizeof(nal_type))) {
+        perror("procbridge read from api");
+        goto report;
+    }
+
+    /* nal_type indexes nal_table, so it must be range checked first */
+    if ((nal_type < 0) || (nal_type >= PTL_IFACE_MAX)) {
+        fprintf(stderr, "procbridge: unknown nal type %d\n", nal_type);
+        goto report;
+    }
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    /* NOTE(review): the NAL initializer's result is overwritten by
+       lib_init() below, exactly as before; its return convention is not
+       visible here, so it is not treated as fatal */
+    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb, 
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+report:
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+}
+#undef LIMIT
+
diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;
+static io_handler io_handlers;
+
+/* Function: now
+ *
+ * Samples the wall clock with gettimeofday() and converts it to the
+ * canonical 64 bit fixed-point form: whole seconds occupy the most
+ * significant 32 bits, and the least significant 32 bits hold the
+ * sub-second part counted in (1/(2^32))ths of a second.
+ *
+ * Return: the current time in those canonical units
+ */
+when now()
+{
+    struct timeval tv;
+    unsigned long long seconds;
+    unsigned long long fraction;
+
+    gettimeofday(&tv,0);
+    seconds=((unsigned long long)tv.tv_sec)<<32;
+    /* rescale microseconds (1/10^6 s) into 1/(2^32) s units */
+    fraction=(((unsigned long long)tv.tv_usec)<<32)/1000000;
+    return(seconds|fraction);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure, or 0 if it could
+ *          not be allocated
+ *
+ * The handler is pushed onto the head of the global handler list.
+ * A negative fd still yields an allocated (fully initialized) handler,
+ * but that handler is NOT linked into the list and is therefore never
+ * dispatched.
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+
+    if (!i) return(0);
+
+    /* initialize every field up front so that an unlinked handler
+       (fd < 0) never exposes indeterminate values to the caller */
+    i->fd=fd;
+    i->type=type;
+    i->function=function;
+    i->argument=arg;
+    i->disabled=0;
+    i->last=0;
+    i->next=0;
+
+    if (fd>=0){
+        /* 'last' holds the address of the pointer that refers to
+           this node; for the head that is the list anchor itself */
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * remove_io_handler() doesn't actually free the handler, due
+ * to reentrancy problems. It only sets the handler's 'disabled'
+ * flag; select_timer_block() skips disabled handlers when
+ * dispatching and unlinks and frees them while it rebuilds the
+ * descriptor sets before its next select() call.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+/* set_flag: add handler n's descriptor to every descriptor set it has
+ * asked for.  fds points at three consecutive sets: fds[0] is the read
+ * set, fds[1] the write set and fds[2] the exception set. */
+static void set_flag(io_handler n,fd_set *fds)
+{
+    int wanted=n->type;
+    int descriptor=n->fd;
+
+    if (wanted & READ_HANDLER) FD_SET(descriptor,&fds[0]);
+    if (wanted & WRITE_HANDLER) FD_SET(descriptor,&fds[1]);
+    if (wanted & EXCEPTION_HANDLER) FD_SET(descriptor,&fds[2]);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time (in the units returned by now())
+ *                   when the select should return; zero means block
+ *                   with no timeout
+ *
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ *   Handlers that were disabled since the previous call are unlinked
+ *   and freed here, before the descriptor sets are built.  A handler
+ *   function that returns 0 is disabled and will be reaped on the
+ *   next call.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+        when current=now();
+        /* a deadline that already passed means "just poll": clamp to
+           zero instead of letting the subtraction wrap around */
+        when interval=(until>current)?(until-current):0;
+
+        timeout.tv_sec=(interval>>32);
+        /* the low 32 bits are a binary fraction of a second; scale
+           that fraction to microseconds */
+        timeout.tv_usec=((interval&0xffffffffULL)*1000000)>>32;
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            j=*k;
+            *k=j->next;
+            /* keep the successor's back pointer consistent */
+            if (*k) (*k)->last=k;
+            free(j);
+            /* re-examine the new occupant of this slot: it may be
+               disabled too and must not reach select() */
+            continue;
+        }
+        set_flag(*k,fds);
+        k=&(*k)->next;
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            if (!(j->disabled) &&
+                ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library: it empties the io handler
+ *   list, records the current wall-clock time in beginning_of_epoch
+ *   and hands select_timer_block to initialize_timer().
+ *   NOTE(review): initialize_timer() is defined outside this file;
+ *   presumably it installs select_timer_block as the timer system's
+ *   blocking function - confirm against pqtimer.c.
+ */
+void init_unix_timer()
+{
+    io_handlers=0;
+    gettimeofday(&beginning_of_epoch, 0);
+    initialize_timer(select_timer_block);
+}
diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am
new file mode 100644 (file)
index 0000000..b62b401
--- /dev/null
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lnet/ulnds/socklnd/README b/lnet/ulnds/socklnd/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However this means that addresses
+exchanged in messages between hosts of different orderings will not
+function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be significantly
+improved by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+*/
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node: its IPv4 address in HOST byte
+ *          order (first octet in the most significant byte), or 0 if
+ *          no address could be determined
+ *
+ * If the PTL_HOSTID environment variable is set it is used verbatim:
+ *  either dotted decimal (n.n.n.n, each n in 0..255) or a number
+ *  starting with "0x".  A malformed value is reported on stderr and
+ *  yields 0.
+ *
+ * Otherwise get_node_id() determines the host name and uses the
+ *  resolver to find out its ip address. This is fairly fragile and
+ *  inflexible, but explicitly asking about interfaces and their
+ *  addresses is very complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    struct hostent *he;
+    char * host_envp;
+    unsigned int a, b, c, d;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            const unsigned char *octet;
+
+            if (gethostname(buffer,sizeof(buffer)))
+                return(0);
+            /* gethostname() need not terminate a truncated name */
+            buffer[sizeof(buffer)-1]=0;
+            he=gethostbyname(buffer);
+            if (!he || he->h_length<4 || !he->h_addr_list[0])
+                return(0);
+            /* the resolver stores the address as bytes in network
+               order; assemble them by hand rather than reading the
+               buffer through a (possibly misaligned) int pointer */
+            octet=(const unsigned char *)he->h_addr_list[0];
+            return(((unsigned int)octet[0]<<24) |
+                   ((unsigned int)octet[1]<<16) |
+                   ((unsigned int)octet[2]<<8) |
+                   (unsigned int)octet[3]);
+        }
+
+    /* test [0] first so an empty string is never indexed past its
+       terminator */
+    if (host_envp[0] && (host_envp[1]=='x' || host_envp[1]=='X'))
+        {
+            long long hostid = strtoll(host_envp, 0, 0);
+            return((unsigned int) hostid);
+        }
+
+    if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4 ||
+        a > 255 || b > 255 || c > 255 || d > 255)
+        {
+            fprintf(stderr, "PTL_HOSTID of \"%s\" is not a valid address\n",
+                    host_envp);
+            return(0);
+        }
+    return((a<<24) | (b<<16) | (c<<8) | d);
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *            pidrequest: the portals pid the caller asked for
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the procnal structures.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+/* DIRECT_IP_MODE variant: the nid is simply the node id returned by
+ * get_node_id(), and the pid is the requested pid, with PTL_PID_ANY
+ * mapped to 0. */
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port=(pidrequest==(unsigned short)PTL_PID_ANY) ? 0 : pidrequest;
+
+    t->nal_cb->ni.nid=get_node_id();
+    t->nal_cb->ni.pid=port;
+}
+#else
+
+/* Virtual-node variant (DIRECT_IP_MODE not defined).
+ *
+ * The nid is built from the low PNAL_HOSTID_SHIFT bits of the node's
+ * IP address shifted left by PNAL_VNODE_SHIFT, plus the virtual node
+ * number taken from the PTL_VIRTNODE environment variable (0 when it
+ * is unset).  The remaining top bits of the address are remembered in
+ * t->iptop8.  The pid is stored as requested once it has been checked
+ * against PNAL_PID_MASK.
+ *
+ * On any invalid input a message is printed to stderr and the function
+ * returns early; fields not yet written at that point are left
+ * untouched.
+ */
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode = 0;
+    /* unsigned so that the shifts below are well defined for
+       addresses whose top bit is set */
+    unsigned int in_addr;
+    ptl_pid_t pid;
+    const char *virtnode_env = getenv("PTL_VIRTNODE");
+
+    /* get and remember my node id*/
+    if (virtnode_env)
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT
+                                              >> PNAL_VNODE_SHIFT);
+            virtnode = atoi(virtnode_env);
+            if (virtnode < 0)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is negative - min 0\n",
+                            virtnode);
+                    return;
+                }
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK)
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+    if (pid==(unsigned short)PTL_PID_ANY)
+        {
+            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
+            return;
+        }
+    if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <portals/lib-p30.h>
+
+/* Per-interface state shared between a NAL implementation and the
+ * generic library code.  Note that the typedef name `bridge` denotes a
+ * POINTER to this structure. */
+typedef struct bridge {
+    int alive;        /* NOTE(review): usage not visible here - presumably a liveness flag */
+    nal_cb_t *nal_cb; /* library callback block; set_address() stores the nid and pid in nal_cb->ni */
+    void *lower;      /* NOTE(review): opaque; presumably the transport-specific state - confirm */
+    void *local;      /* NOTE(review): opaque; owner not visible here */
+    void (*shutdown)(struct bridge *); /* teardown hook, passed this bridge */
+    /* this doesn't really belong here */
+    unsigned char iptop8; /* top bits of the node's IP address, saved by the virtual-node set_address() */
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c
new file mode 100644 (file)
index 0000000..89c9f78
--- /dev/null
@@ -0,0 +1,293 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: arg1: a connection stored in the hash table
+ *            arg2: the key to verify against, an array of two
+ *                  unsigned ints holding { ip, port }
+ * Returns: 1 if the connection is the one requested, 0 otherwise
+ *
+ *    compare_connection() resolves collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection candidate = arg1;
+        unsigned int *wanted = arg2;
+
+        if (candidate->ip != wanted[0])
+                return(0);
+        return(candidate->port == wanted[1]);
+}
+
+
+/* Function:  connection_key
+ * Arguments: id: the key to hash, an array of two unsigned ints
+ *                holding { ip, port }
+ * Returns: a not-particularly-well-distributed hash of the id:
+ *          the exclusive-or of its two words
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    unsigned int ip=id[0];
+    unsigned int port=id[1];
+
+    return(ip^port);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: arg: the connection to remove (passed as void * so the
+ *                 function can also serve as a table destructor)
+ *
+ * Drops the connection from its manager's hash table, closes its
+ * socket and frees the structure.
+ * NOTE(review): the io handler registered for this socket by
+ * allocate_connection() is not removed here - confirm that callers
+ * cannot be dispatched with the freed connection afterwards.
+ */
+void remove_connection(void *arg)
+{
+        connection c = arg;
+        unsigned int id[2] = { c->ip, c->port };
+
+        hash_table_remove(c->m->connections,id);
+        close(c->fd);
+        free(c);
+}
+
+
+/* Function:  read_connection:
+ * Arguments: c:    the connection to read from
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors.  A read interrupted by a signal (EINTR) is retried.
+ *   On end-of-file or any other error the connection is removed
+ *   (closed and freed) before 0 is returned, so the caller must not
+ *   touch c again.  A len of zero or less reads nothing and succeeds.
+ *   TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    while (offset<len){
+        rc=syscall(SYS_read, c->fd, dest+offset, len-offset);
+        if (rc>0){
+            offset+=rc;
+            continue;
+        }
+        /* errno is only meaningful when the call failed; a return of
+           0 is end-of-file and must not be retried on a stale EINTR */
+        if (rc<0 && errno==EINTR)
+            continue;
+        remove_connection(c);
+        return(0);
+    }
+    return(1);
+}
+
+/* connection_input: io handler installed for every connection socket.
+ * Forwards to the manager's input handler, passing the manager's
+ * handler argument and the connection, and returns that handler's
+ * result.  The parameter is a void * so that the function matches the
+ * int (*)(void *) callback type expected by register_io_handler(). */
+static int connection_input(void *arg)
+{
+    connection c = arg;
+
+    return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:    the manager the connection belongs to
+ *            ip:   peer address, first half of the table key
+ *            port: peer port, second half of the table key
+ *            fd:   open file descriptor for the socket
+ * Returns: an allocated connection structure, or 0 if memory could
+ *          not be obtained (fd is closed in that case)
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation, registration of the read handler and
+ *  placement in the manager's table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+
+    if (!c){
+        /* without a connection to track it nobody would ever close
+           the socket, so release it here */
+        close(fd);
+        return(0);
+    }
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request; it accepts the request and
+ *     installs a new connection keyed on the peer's address
+ *     (host byte order) and port 0.  A failed accept() is reported
+ *     and otherwise ignored.
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    socklen_t len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+
+    if (fd<0){
+        /* nothing to install; keep listening */
+        perror("tcpnal accept");
+        return(1);
+    }
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,ntohl(s.sin_addr.s_addr),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: m:    the connection manager
+ *            ip:   peer address in host byte order
+ *            port: ignored; every peer is contacted on
+ *                  tcpnal_acceptor_port
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection; 0 if the
+ *          connect failed
+ *
+ * NOTE(review): failure to create the socket still terminates the
+ * process via exit(-1), as before.
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+    int fd;
+
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    /* reuse an established connection when there is one */
+    if ((c=hash_table_find(m->connections,id)))
+        return(c);
+
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = htonl(ip);
+    addr.sin_port        = htons(port);
+
+    if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("tcpnal socket failed");
+        exit(-1);
+    }
+    if (connect(fd,
+                (struct sockaddr *)&addr,
+                sizeof(struct sockaddr_in)))
+        {
+            /* report first: close() may overwrite errno */
+            perror("tcpnal connect");
+            close(fd);
+            return(0);
+        }
+    return(allocate_connection(m,ip,port,fd));
+}
+
+
+/* Function:  bind_socket
+ * Arguments: m: the connection manager for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound, bound_handler and port elements of m.  On
+ *  failure the socket is closed again and no handler is registered.
+ *
+ *  NOTE(review): port is copied into sin_port, and sin_port into
+ *  m->port, without any byte order conversion - confirm what the
+ *  callers expect.
+ *  TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    socklen_t alen=sizeof(struct sockaddr_in);
+
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+        return(0);
+
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    addr.sin_port        = port;
+
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind");
+        goto fail;
+    }
+
+    /* learn which port the kernel picked when port was zero */
+    if (getsockname(m->bound,(struct sockaddr *)&addr, &alen)<0){
+        perror ("tcpnal getsockname");
+        goto fail;
+    }
+
+    if (listen(m->bound,5)<0){
+        perror ("tcpnal listen");
+        goto fail;
+    }
+
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    m->port=addr.sin_port;
+    return(1);
+
+fail:
+    close(m->bound);
+    m->bound=-1;
+    return(0);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources: the listening socket
+ * is closed, its io handler is marked for removal, the connection
+ * table is destroyed with remove_connection as the per-entry
+ * destructor, and finally the manager itself is freed.
+ *
+ * NOTE(review): the io handlers registered for the individual
+ * connections are not marked for removal here - confirm nothing can
+ * dispatch them after this call.
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: pid:   the port to attempt to bind the listening
+ *                   socket to
+ *            input: handler invoked when a connection has input
+ *            a:     opaque argument handed back to input
+ * Returns: a newly allocated manager structure, or
+ *          zero if memory could not be obtained or the fixed port
+ *          could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(),
+                         void *a)
+{
+    manager m=(manager)malloc(sizeof(struct manager));
+
+    if (!m) return(0);
+
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+
+    /* the table holds no connections yet, so this only releases the
+       table itself (presumably - hash_destroy_table is defined in
+       table.c) */
+    if (m->connections)
+        hash_destroy_table(m->connections,remove_connection);
+    free(m);
+    return(0);
+}
diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h
new file mode 100644 (file)
index 0000000..f6b2994
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <table.h>
+
+/* State of one connection manager.  The typedef name `manager`
+ * denotes a POINTER to this structure. */
+typedef struct manager {
+    table connections;        /* hash table of connections, keyed on { ip, port } */
+    int bound;                /* listening socket created by bind_socket() */
+    io_handler bound_handler; /* io handler that accepts new connections on 'bound' */
+    int (*handler)(void *, void *); /* called as handler(handler_arg, connection) when a connection is readable */
+    void *handler_arg;        /* opaque first argument for 'handler' */
+    unsigned short port;      /* sin_port of 'bound' as reported by getsockname() */
+} *manager;
+
+
+/* One open TCP connection to a peer.  The typedef name `connection`
+ * denotes a POINTER to this structure. */
+typedef struct connection {
+    unsigned int ip;     /* peer address; first word of the hash table key */
+    unsigned short port; /* peer port; second word of the hash table key */
+    int fd;              /* socket descriptor, closed by remove_connection() */
+    manager m;           /* the manager whose table holds this connection */
+} *connection;
+
+connection force_tcp_connection(manager m,
+                                unsigned int ip,  
+                               unsigned int short);
+manager init_connections(unsigned short,
+                         int (*f)(void *,connection),
+                         void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len);
diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+int smp_processor_id = 1;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+FILE *debug_file_fd;
+
+/* Userspace stand-in for the log dump routine: it only prints the
+ * contents of debug_file_name to stdout.  arg is ignored.
+ * Always returns 0. */
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+/* No-op in this userspace implementation. */
+void portals_debug_print(void)
+{
+        return;
+}
+
+
+/* Prints the contents of debug_file_name to stdout; nothing is
+ * actually dumped by this userspace implementation. */
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+
+/* Directs debug output to stdout.  bufsize is ignored.
+ * Always returns 0. */
+int portals_debug_init(unsigned long bufsize)
+{ 
+        debug_file_fd = stdout;
+        return 0;
+}
+
+/* Nothing to release: the debug stream is left open.
+ * Always returns 0. */
+int portals_debug_cleanup(void)
+{
+        return 0; //close(portals_debug_fd);
+}
+
+/* No-op in this userspace implementation.  Always returns 0. */
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+/* Writes a "DEBUG MARKER: <text>" line, framed by two rows of
+ * asterisks, to debug_file_fd.  Always returns 0.
+ * NOTE(review): debug_file_fd is only assigned in
+ * portals_debug_init() - presumably that must be called first. */
+int portals_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+/* No-op in this userspace implementation: buf and len are ignored and
+ * nothing is copied.  Always returns 0. */
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        return 0;
+}
+
+/* portals_debug_msg: write one formatted debug record.
+ *
+ * Arguments: subsys, mask: subsystem and debug mask of the message
+ *                          (printed in hex; subsys is shifted right
+ *                          by 24 first)
+ *            file, fn, line: source location of the caller
+ *            format, ...: printf-style message
+ *
+ * The record consists of a prefix with subsystem, mask, processor id
+ * and timestamp, the source location together with an estimate of the
+ * remaining stack derived from the address of a local variable, and
+ * finally the caller's message.  Output goes to debug_file_fd, or to
+ * stderr if that stream has not been set up yet.
+ */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        struct timeval tv;
+        FILE *out = debug_file_fd ? debug_file_fd : stderr;
+
+        gettimeofday(&tv, NULL);
+
+        /* the casts make the arguments match %lu whatever the
+         * platform's time_t and suseconds_t happen to be */
+        fprintf(out,
+                "%02x:%06x:%d:%lu.%06lu ",
+                subsys >> 24, mask, smp_processor_id,
+                (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        /* only the ADDRESS of flags is used, never its value */
+        fprintf(out,
+                "(%s:%d:%s() %d+%ld): ",
+                file, line, fn, 0,
+                (long)(8192 - ((unsigned long)&flags & 8191UL)));
+
+        /* a va_list must go through the v* variant of printf */
+        va_start (ap, format);
+        vfprintf(out, format, ap);
+        va_end (ap);
+}
+
diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+/* One registered descriptor in the select() dispatch list.  Handlers
+   form a list linked through 'next'. */
+struct io_handler{
+  io_handler *last;        /* address of the pointer that refers to this node (set at registration) */
+  io_handler next;         /* following handler in the list, or 0 */
+  int fd;                  /* descriptor handed to select() */
+  int type;                /* mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER */
+  int (*function)(void *); /* callback; a return value of 0 disables the handler */
+  void *argument;          /* opaque value passed to 'function' */
+  int disabled;            /* non-zero: skip when dispatching, unlink and free on the next select pass */
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+/* PNAL_IP: rebuild the network byte order IPv4 address for a nid.
+ * The host part is the nid shifted right by PNAL_VNODE_SHIFT; the top
+ * bits come from t->iptop8, where t is a pointer to a structure with
+ * an iptop8 member.  The argument t is parenthesized so any pointer
+ * expression may be passed, and iptop8 is widened to unsigned before
+ * the shift so that values of 128 and above do not shift into the
+ * sign bit of an int. */
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | (((unsigned)(t)->iptop8) << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lnet/ulnds/socklnd/pqtimer.c b/lnet/ulnds/socklnd/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* pqtimer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct timer {                /* one scheduled callback; the timers[] heap stores pointers to these */
+  void (*function)(void *);   /* callback run by timer_fire() when the timer expires */
+  void *arg;                  /* argument handed to function */
+  when w;                     /* absolute expiry time; this is the heap ordering key */
+  int interval;               /* interval given to register_timer(), narrowed from 'when' to int */
+  int disable;                /* set by remove_timer(); timer_fire() then skips the callback */
+};
+
+typedef struct thunk *thunk;  /* note: 'thunk' is a POINTER to struct thunk */
+struct thunk {                /* node of the singly linked list walked by timer_loop() */
+    void (*f)(void *);        /* function to call on every pass of timer_loop() */
+    void *a;                  /* argument passed to f */
+    thunk next;               /* next node; register_thunk() pushes at the head */
+};
+
+extern when now(void);
+
+static thunk thunks;                  /* head of the thunk list */
+static int internal;                  /* suppresses the block_function call in register_timer(); never assigned in this file */
+static void (*block_function)(when);  /* installed by initialize_timer(); called with an absolute deadline, or 0 for none */
+static int number_of_timers;          /* number of live heap entries; they occupy timers[1..number_of_timers] */
+static int size_of_pqueue;            /* allocated capacity of timers[], in elements */
+static timer *timers;                 /* 1-based binary min-heap ordered by ->w; slot 0 is unused */
+
+
+/* Sift-down: restore the min-heap property for the subtree rooted at
+ * index 'where' of the 1-based timers[] array, moving the entry down
+ * until neither child expires strictly earlier than it does.
+ */
+static void heal(int where)
+{
+    for (;;) {
+        int child = where << 1;
+        int smallest = where;
+        timer held;
+
+        /* left child first, then right; ties keep the earlier choice */
+        if ((child <= number_of_timers) &&
+            (timers[child]->w < timers[smallest]->w))
+            smallest = child;
+        child++;
+        if ((child <= number_of_timers) &&
+            (timers[child]->w < timers[smallest]->w))
+            smallest = child;
+
+        if (smallest == where)
+            return;
+
+        held = timers[where];
+        timers[where] = timers[smallest];
+        timers[smallest] = held;
+        where = smallest;
+    }
+}
+
+/* Sift-up: bubble the entry at index i of the 1-based timers[] heap
+ * towards the root for as long as it expires strictly earlier than
+ * its parent.
+ */
+static void add_pqueue(int i)
+{
+    while (i > 1) {
+        int up = i >> 1;
+        timer moved;
+
+        if (!(timers[i]->w < timers[up]->w))
+            break;
+
+        moved = timers[i];
+        timers[i] = timers[up];
+        timers[up] = moved;
+        i = up;
+    }
+}
+
+/* Insert t into the timer heap, growing the backing array when needed.
+ *
+ * The heap is 1-based (slot 0 is never used), so the array must hold at
+ * least number_of_timers+2 pointers before the new entry is appended.
+ * The array is grown with realloc(), which releases the previous storage
+ * (the old code malloc'ed a fresh array and leaked the old one), and it
+ * is sized in units of 'timer' pointers, which is what the array holds.
+ * This function cannot report failure to its caller, so running out of
+ * memory aborts instead of writing through a null pointer.
+ */
+static void add_timer(timer t)
+{
+    if (size_of_pqueue<(number_of_timers+2)){
+       int newsize=size_of_pqueue+10;
+       timer *grown=(timer *)realloc(timers,sizeof(timer)*newsize);
+
+       if (!grown)
+           abort();
+       timers=grown;
+       size_of_pqueue=newsize;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure, or NULL if the structure
+ *          could not be allocated
+ *
+ * The timer is one-shot: timer_fire() frees the structure once it has
+ * reached the head of the queue, whether or not remove_timer() disabled
+ * it in the meantime.
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    /* a failed allocation used to be dereferenced unconditionally */
+    if (!t)
+        return(NULL);
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;   /* narrowed: the struct field is an int */
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    /* when this is the only pending timer (and 'internal' is clear) the
+       registered block function is called with the new expiry time */
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to cancel, as returned by register_timer
+ * Returns: nothing
+ *
+ * remove_timer removes a timer from the system, ensuring
+ * that its callback will never be called. It does not actually
+ * free the timer due to reentrancy issues: it only sets the
+ * disable flag, and timer_fire() frees the structure when the
+ * timer reaches the head of the queue.
+ */
+
+void remove_timer(timer t)
+{
+    t->disable=1;
+}
+
+
+
+/* Function: timer_fire
+ * Arguments: none
+ * Returns: nothing
+ *
+ * Pops the earliest timer off the heap, runs its callback unless the
+ * timer was disabled by remove_timer(), and frees it. The entry is
+ * unlinked from the heap before the callback runs. Calling this with
+ * no timer pending is a no-op; previously it read and freed whatever
+ * was left in heap slot 1.
+ */
+void timer_fire(void)
+{
+    timer current;
+
+    if (number_of_timers<1)
+        return;
+
+    current=timers[1];
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+/* Fire every timer whose expiry time is not later than the time sampled
+ * on entry, then return the expiry time of the earliest timer still
+ * pending, or 0 when none remain.
+ */
+when next_timer(void)
+{
+    const when deadline=now();
+
+    for (;;) {
+        if (!number_of_timers)
+            return(0);
+        if (timers[1]->w > deadline)
+            return(timers[1]->w);
+        timer_fire();
+    }
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ *
+ * timer_loop() is the blocking dispatch function for the timer.
+ * On every pass it runs all registered thunks, then either fires the
+ * earliest timer (if it is due) or calls the block function registered
+ * with initialize_timer until that timer's expiry time. With no timers
+ * pending the thunks are run a second time and the block function is
+ * called with 0, i.e. without a deadline.
+ */
+void timer_loop()
+{
+    for (;;) {
+        thunk t;
+        when start=now();
+
+        t=thunks;
+        while (t) {
+            (*t->f)(t->a);
+            t=t->next;
+        }
+
+        if (!number_of_timers) {
+            /* idle: second thunk pass, then block with no deadline */
+            t=thunks;
+            while (t) {
+                (*t->f)(t->a);
+                t=t->next;
+            }
+            (*block_function)(0);
+            continue;
+        }
+
+        if (timers[1]->w > start)
+            (*block_function)(timers[1]->w);
+        else
+            timer_fire();
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions are called from timer_loop() at irregular
+ * intervals; they should not assume when, or take a particularly
+ * long time. Thunks are for background cleanup tasks. The new
+ * thunk is pushed onto the head of the list.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk head=thunks;
+    thunk fresh=(thunk)malloc(sizeof(*fresh));
+
+    fresh->next=head;
+    fresh->a=a;
+    fresh->f=f;
+    thunks=fresh;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function to call to block until the given
+ *                   absolute time (0 means no deadline)
+ *
+ * initialize_timer() must be called before any other timer function,
+ * including timer_loop. It empties the thunk list and the timer heap
+ * and allocates the initial heap array.
+ */
+void initialize_timer(void (*block)(when))
+{
+    enum { INITIAL_SLOTS = 10 };
+
+    thunks=0;
+    number_of_timers=0;
+    block_function=block;
+    size_of_pqueue=INITIAL_SLOTS;
+    timers=(timer *)malloc(INITIAL_SLOTS*sizeof(timer));
+}
diff --git a/lnet/ulnds/socklnd/pqtimer.h b/lnet/ulnds/socklnd/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned long long when;   /* time value: seconds in the upper 32 bits, 2^-32 s fractions below */
+when now(void);
+typedef struct timer *timer;       /* opaque handle; note that 'timer' is a pointer type */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+timer register_timer_wait(void);   /* NOTE(review): no definition is visible in pqtimer.c -- confirm it exists */
+void remove_timer(timer);          /* disables the callback; the structure is freed later by timer_fire() */
+void timer_loop(void);             /* dispatch loop; does not return */
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+#define HZ 0x100000000ull          /* 2^32: one second expressed in 'when' units */
+
+
diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* api.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code: PTL_OK on success, PTL_SEGV if any
+ *          pipe transfer fails or is short
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ *
+ * Wire format written to the to_lib pipe: id, args_len, ret_len, then
+ * args_len bytes of arguments. The reply is ret_len bytes read from
+ * the from_lib pipe.
+ *
+ * forward_failure() wraps one write: on a short or failed transfer it
+ * calls lib_fini() and returns PTL_SEGV from the enclosing function.
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    /* Retry only when the read itself failed with EINTR. The previous
+       condition used "errno += EINTR", which modified errno and kept
+       looping on any short or failed read. */
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k<0) && (errno==EINTR));
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index (unused)
+ * Returns: always 0
+ *
+ * Sends the PTL_FINI codepoint to the lower side, waits for its
+ * one-int acknowledgement, then closes all four pipe descriptors and
+ * frees the procbridge state. Results of the individual system calls
+ * are not checked.
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    bridge top=(bridge)n->nal_data;
+    procbridge pb=(procbridge)top->local;
+    int fds[4];
+    int code=PTL_FINI;
+    int i;
+
+    /* request/acknowledge handshake with the library-side thread */
+    syscall(SYS_write, pb->to_lib[1],&code,sizeof(code));
+    syscall(SYS_read, pb->from_lib[0],&code,sizeof(code));
+
+    fds[0]=pb->to_lib[0];
+    fds[1]=pb->to_lib[1];
+    fds[2]=pb->from_lib[0];
+    fds[3]=pb->from_lib[1];
+    for (i=0;i<4;i++)
+        syscall(SYS_close, fds[i]);
+
+    free(pb);
+    return(0);
+}
+
+
+/* Function: validate
+ *    useless stub: ignores all three arguments and always returns 0.
+ *    It exists only to fill the validate slot of api_nal.
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  n: pointer to my top-side nal structure
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half (wakeup_topside() broadcasts on the same
+ *   condition variable).
+ *
+ *  NOTE(review): the wait is not guarded by a predicate, so a
+ *   broadcast issued before the wait starts is not remembered,
+ *   and a spurious wakeup returns early -- confirm callers
+ *   tolerate both.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_wait(&p->cond,&p->mutex);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+static void procbridge_lock(nal_t * nal, unsigned long *flags){}   /* no-op filler for the 'lock' slot of api_nal */
+static void procbridge_unlock(nal_t * nal, unsigned long *flags){} /* no-op filler for the 'unlock' slot of api_nal */
+/* api_nal
+ *  the interface vector that allows the generic code to access
+ *  this nal. this is separate from the library side nal_cb.
+ *  nal_data is filled in later with the bridge structure.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+    .ni       = {0},
+    .nal_data = NULL,
+    .forward  = procbridge_forward,
+    .shutdown = procbridge_shutdown,
+    .validate = procbridge_validate,
+    .yield    = procbridge_yield,
+    .lock     = procbridge_lock,
+    .unlock   = procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ *
+ * unix_failure(operand,fd,buffer,length,text) issues the named system
+ * call and, unless it transfers exactly 'length' bytes, calls
+ * perror(text) and returns NULL from the ENCLOSING function. It can
+ * therefore only be used inside functions that return a pointer, and
+ * it releases nothing on the way out.
+ */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+#if 0
+/* Compiled out. Older entry point that took explicit limits and
+ * returned the library's status through *rc; procbridge_interface()
+ * is the live replacement. Kept for reference.
+ */
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    /* rc is an int *, so the transfer size is that of the pointed-to
+       int; sizeof(rc) would be the size of the pointer */
+    unix_failure(read,p->from_lib[0], rc, sizeof(*rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+ptl_nid_t tcpnal_mynid;   /* copied into the library side's ni.nid once initialization succeeds */
+
+/* Function: procbridge_interface
+ * Arguments: num_interface: unused
+ *            ptl_size: requested max portal table index (0 keeps the default)
+ *            acl_size: requested max access table index (0 keeps the default)
+ *            requested_pid: process id forwarded to the library side
+ * Returns: a pointer to the statically allocated top side NAL
+ *          structure, or NULL on failure
+ *
+ * On first use this creates the two pipes, queues the startup
+ * parameters (pid, limits, nal type) on the to_lib pipe, starts the
+ * library-side thread and waits for its one-int status reply. Later
+ * calls return the already initialized NAL.
+ *
+ * The mutex and condition variable are initialised BEFORE the thread
+ * is started: the library thread may call wakeup_topside() as soon as
+ * it has sent its status, which used to race with their initialisation
+ * here. Failures that happen before the thread exists release the
+ * bridge structures and pipe descriptors instead of leaking them.
+ */
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p=NULL;
+    bridge b=NULL;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+    int to_lib_open=0, from_lib_open=0, sync_ready=0;
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    if(!b || !p) {
+        perror("nal_init: malloc");
+        goto fail;
+    }
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib)) {
+        perror("nal_init: pipe");
+        goto fail;
+    }
+    to_lib_open=1;
+    if(pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        goto fail;
+    }
+    from_lib_open=1;
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    /* startup parameters, in the order nal_thread() reads them */
+    if(syscall(SYS_write,p->to_lib[1],&requested_pid,
+               sizeof(requested_pid))!=sizeof(requested_pid) ||
+       syscall(SYS_write,p->to_lib[1],&limits,
+               sizeof(ptl_ni_limits_t))!=sizeof(ptl_ni_limits_t) ||
+       syscall(SYS_write,p->to_lib[1],&nal_type,
+               sizeof(nal_type))!=sizeof(nal_type)) {
+        perror("nal_init: write");
+        goto fail;
+    }
+
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+    sync_ready=1;
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        goto fail;
+    }
+
+    /* From here on the library thread also uses b and p, so they are
+       not released on the remaining failure paths. */
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+
+    return (&api_nal);
+
+fail:
+    if(sync_ready) {
+        pthread_cond_destroy(&p->cond);
+        pthread_mutex_destroy(&p->mutex);
+    }
+    if(from_lib_open) {
+        syscall(SYS_close, p->from_lib[0]);
+        syscall(SYS_close, p->from_lib[1]);
+    }
+    if(to_lib_open) {
+        syscall(SYS_close, p->to_lib[0]);
+        syscall(SYS_close, p->to_lib[1]);
+    }
+    api_nal.nal_data=NULL;
+    free(p);
+    free(b);
+    return(NULL);
+}
+#undef unix_failure
diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+typedef struct procbridge {    /* note: the typedef name 'procbridge' is a POINTER to this struct */
+    pthread_t t;               /* library-side thread, started on nal_thread() */
+    pthread_cond_t cond;       /* broadcast by wakeup_topside(), waited on by procbridge_yield() */
+    pthread_mutex_t mutex;     /* held around the wait/broadcast on cond */
+    int to_lib[2];             /* pipe: the api side writes [1], the library side reads [0] */
+    int from_lib[2];           /* pipe: the library side writes [1], the api side reads [0] */
+} *procbridge;
+
+extern void *nal_thread(void *);   /* library-side thread entry point; the argument is the bridge */
+
+
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)   /* first code above the dispatch range; not used in the code shown here */
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)   /* command code sent over to_lib to request shutdown */
+
+#define MAX_ACLS        1     /* ceiling applied to max_atable_index in nal_thread() */
+#define MAX_PTLS        128   /* ceiling applied to max_ptable_index in nal_thread() */
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* lib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+static int nal_write(nal_cb_t *nal,   /* cb_write callback: memcpy of len bytes from src_addr to dst_addr; nal and private are unused; always returns 0 */
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+static int nal_read(nal_cb_t * nal,   /* cb_read callback: memcpy of len bytes from src_addr to dst_addr; nal and private are unused; always returns 0 */
+                    void *private,
+                   void *dst_addr,
+                   user_ptr src_addr,
+                   size_t len)
+{
+       memcpy(dst_addr, src_addr, len);
+       return 0;
+}
+
+/* cb_malloc callback: hands the request straight to malloc(). The nal
+ * argument is unused and the result may be NULL. */
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    return malloc(len);
+}
+
+static void nal_free(nal_cb_t *nal,   /* cb_free callback: plain free(buf); nal and len are unused */
+                     void *buf,
+                     ptl_size_t len)
+{
+    free(buf);
+}
+
+static void nal_printf(nal_cb_t *nal,   /* cb_printf callback: printf-style output to stdout through vprintf(); nal is unused */
+                       const char *fmt,
+                       ...)
+{
+    va_list        ap;
+
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
+}
+
+
+static void nal_cli(nal_cb_t *nal,   /* cb_cli callback: deliberately empty, leaves *flags untouched */
+                    unsigned long *flags)
+{
+}
+
+
+static void nal_sti(nal_cb_t *nal,   /* cb_sti callback: deliberately empty counterpart of nal_cli */
+                    unsigned long *flags)
+{
+}
+
+
+static int nal_dist(nal_cb_t *nal,   /* cb_dist callback: returns 0 WITHOUT storing anything through dist -- NOTE(review): confirm callers do not read *dist */
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: arg: the bridge (nal state) for this interface
+ * Returns: whether to continue reading from the pipe (1 = keep the
+ *          handler, 0 = disable it)
+ *
+ *   data_from_api() reads data from the api side in response
+ *   to a select. One request consists of a command index, the
+ *   argument length, the reply length and then the argument bytes;
+ *   the reply bytes are written back on the from_lib pipe.
+ *   PTL_FINI shuts the library down, acknowledges on the from_lib
+ *   pipe and terminates this thread.
+ *
+ *   The two lengths arrive over the pipe and are checked against the
+ *   fixed-size local blocks before they are used; an oversized
+ *   request is handled like a failed transfer (lib_fini, return 0)
+ *   instead of overrunning the stack buffers.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+    bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    /* where are these two sizes derived from ??*/
+    char arg_block[ 256 ];
+    /* zeroed so that reply bytes the dispatcher does not fill in are
+       never sent back as uninitialised stack contents */
+    char ret_block[ 128 ] = {0};
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+
+    /* the casts make a negative length (should ptl_size_t be signed)
+       compare as a huge value and be rejected as well */
+    if (((unsigned long long)arg_len > sizeof(arg_block)) ||
+        ((unsigned long long)ret_len > sizeof(ret_block))) {
+        lib_fini(b->nal_cb);
+        return(0);
+    }
+
+    data_failure(read,fd, arg_block, arg_len);
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    data_failure(write,p->from_lib[1],ret_block, ret_len);
+    return(1);
+}
+#undef data_failure
+
+
+
+static void wakeup_topside(void *z)   /* thunk registered by nal_thread(); z is the bridge */
+{
+    bridge b=z;
+    procbridge p=b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_broadcast(&p->cond);   /* releases any thread blocked in procbridge_yield() */
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing, and only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ *
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience. LIMIT(x,y,max) expands to a bare
+ *   "if" statement with unparenthesised arguments and its own
+ *   trailing semicolon, so it is only safe as a full statement
+ *   with simple arguments.
+ *
+ *  nal_table maps a nal type read from the api side to that nal's
+ *   initialisation function; only index 1 (tcpnal_init) is populated.
+ */
+#define LIMIT(x,y,max)\
+     if ((unsigned int)x > max) y = max;
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+/* Library-side thread body. Builds the nal_cb callback table, reads the
+ * startup parameters (pid, limits, nal type) that the api side queued
+ * on the to_lib pipe, initialises the selected nal and the generic
+ * library, reports the resulting status as one int on the from_lib
+ * pipe and, on success, enters timer_loop() and does not return.
+ *
+ * Any startup failure (allocation, short or failed parameter read) is
+ * reported to the api side as status -1 so that it does not block
+ * forever waiting for the reply. Always returns 0 when it returns.
+ */
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    if (!b->nal_cb) {
+        perror("procbridge malloc nal_cb");
+        goto fail;
+    }
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    /* each parameter must arrive complete; the old "!(rc = read)" test
+       only noticed end-of-file and let errors and short reads through */
+    if (syscall(SYS_read, p->to_lib[0], &pid_request,
+                sizeof(pid_request)) != sizeof(pid_request)) {
+        perror("procbridge read from api");
+        goto fail;
+    }
+    if (syscall(SYS_read, p->to_lib[0], &desired,
+                sizeof(ptl_ni_limits_t)) != sizeof(ptl_ni_limits_t)) {
+        perror("procbridge read from api");
+        goto fail;
+    }
+    if (syscall(SYS_read, p->to_lib[0], &nal_type,
+                sizeof(nal_type)) != sizeof(nal_type)) {
+        perror("procbridge read from api");
+        goto fail;
+    }
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    /* nal_type came over the pipe: bounds-check it before indexing.
+       NOTE(review): the nal initialiser's result is overwritten by
+       lib_init() below, exactly as before -- confirm that is intended. */
+    if ((nal_type >= 0) && (nal_type < PTL_IFACE_MAX) && nal_table[nal_type])
+        rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb,
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+    /* this should perform error checking */
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+
+fail:
+    /* tell the waiting api side that startup failed */
+    rc = -1;
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    return(0);
+}
+#undef LIMIT
+
diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;   /* wall-clock time recorded by init_unix_timer(); not read anywhere in the code shown */
+static io_handler io_handlers;              /* head of the list of registered handlers */
+
+/* Function: now
+ *
+ * Return: the current wall-clock time in canonical units: a 64 bit
+ *   number whose most significant 32 bits hold the number of seconds
+ *   and whose least significant 32 bits hold a count of (1/(2^32))ths
+ *   of a second.
+ */
+when now()
+{
+    struct timeval tv;
+    unsigned long long whole;
+    unsigned long long fraction;
+
+    gettimeofday(&tv,0);
+    whole=((unsigned long long)tv.tv_sec)<<32;
+    /* microseconds rescaled to 2^-32 second units */
+    fraction=(((unsigned long long)tv.tv_usec)<<32)/1000000;
+    return(whole|fraction);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd;
+ *                      a zero return disables the handler
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure, or NULL if it could
+ *          not be allocated
+ *
+ * The handler is pushed onto the head of the handler list. When fd is
+ * negative the structure is still filled in and returned, but it is
+ * not linked into the list and so is never serviced.
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+
+    /* a failed allocation used to be dereferenced unconditionally */
+    if (!i)
+        return(NULL);
+
+    /* fill in every field up front so that an unlinked handler
+       (fd < 0) never carries uninitialised members */
+    i->fd=fd;
+    i->type=type;
+    i->function=function;
+    i->argument=arg;
+    i->disabled=0;
+    i->next=0;
+    i->last=0;
+
+    if (fd>=0){
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * remove_io_handler() doesn't actually free the handler, due
+ * to reentrancy problems. it just sets the handler's disabled
+ * flag; select_timer_block() unlinks and frees disabled
+ * handlers the next time it runs.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+/* Add n's descriptor to each of the three select() sets that its type
+ * mask asks for: fds[0] is the read set, fds[1] the write set and
+ * fds[2] the exception set.
+ */
+static void set_flag(io_handler n,fd_set *fds)
+{
+    fd_set *readable=&fds[0];
+    fd_set *writable=&fds[1];
+    fd_set *exceptional=&fds[2];
+
+    if (n->type & READ_HANDLER)
+        FD_SET(n->fd,readable);
+    if (n->type & WRITE_HANDLER)
+        FD_SET(n->fd,writable);
+    if (n->type & EXCEPTION_HANDLER)
+        FD_SET(n->fd,exceptional);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return,
+ *                   or 0 to wait for io without a timeout
+ *
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ *   Handlers that were disabled since the last call are unlinked and
+ *   freed first; a handler whose function returns 0 is disabled.
+ *
+ *   Fixes over the previous version:
+ *    - the sub-second part of the interval is a count of 2^-32 s
+ *      units and is now scaled to microseconds as frac*1000000/2^32
+ *      (it used to be divided by 1000000, yielding about 2ms for
+ *      half a second);
+ *    - a deadline that has already passed polls with a zero timeout
+ *      instead of wrapping around to an enormous unsigned interval;
+ *    - every disabled handler is removed before the fd sets are
+ *      built, including one that directly follows a freed handler.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+        when current=now();
+        when interval=(until>current) ? (until-current) : 0;
+
+        timeout.tv_sec=(interval>>32);
+        timeout.tv_usec=((interval&0xffffffffull)*1000000)>>32;
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            j=*k;
+            *k=j->next;
+            free(j);
+            continue;       /* re-examine whatever moved into this slot */
+        }
+        set_flag(*k,fds);
+        k=&(*k)->next;
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            if (!(j->disabled) &&
+                ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library: it empties the io handler
+ *   list, records the current wall-clock time in beginning_of_epoch
+ *   and installs select_timer_block() as the block function of the
+ *   timer system.
+ */
+void init_unix_timer()
+{
+    io_handlers=0;
+    gettimeofday(&beginning_of_epoch, 0);
+    initialize_timer(select_timer_block);
+}
diff --git a/lnet/ulnds/socklnd/table.c b/lnet/ulnds/socklnd/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table
+ */
+
+/* Function: table_lookup
+ * Arguments: t: the table to search
+ *            comparator: the value handed to the compare function
+ *            k: the full (unreduced) hash key of comparator
+ *            compare_function: called as (stored value, comparator);
+ *                              a nonzero result is a match. when null
+ *                              nothing can match
+ *            success: set to 1 on a match, 0 otherwise
+ * Returns: on a match, the address of the link that points at the
+ *          matching entry; otherwise the address of the head of the
+ *          bucket that k falls in, where a new entry can be pushed
+ */
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    table_entry *head=&(t->entries[k%t->size]);
+    table_entry *i;
+
+    *success=0;
+    /* without a compare function no entry can match, so there is
+       no point in walking the chain */
+    if (!compare_function) return(head);
+
+    for (i=head;*i;i=&((*i)->next)){
+        /* cheap full-key test first, then the function the caller
+           actually passed in */
+        if (((*i)->key==k) &&
+            (*compare_function)((*i)->value,comparator)){
+            *success=1;
+            return(i);
+        }
+    }
+    return(head);
+}
+
+
+/* Function: resize_table
+ * Arguments: t: the table to rehash
+ *            size: the new number of buckets
+ * Returns: nothing
+ *
+ *   Allocates a fresh bucket array and moves every entry onto it.
+ *   If size is not positive or the allocation fails, the table is
+ *   left exactly as it was (just more densely loaded).
+ */
+static void resize_table(table t, int size)
+{
+    unsigned int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    table_entry *new_entries;
+    unsigned int i;
+    table_entry j,n;
+    table_entry *position;
+
+    if (size<=0) return;
+    /* calloc leaves every bucket as an empty (null) chain */
+    new_entries=(table_entry *)calloc(size,sizeof(table_entry));
+    if (!new_entries) return;
+
+    t->size=size;
+    t->entries=new_entries;
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            /* entries are already unique, so no comparison is
+               needed: push onto the head of the new bucket */
+            position=&(t->entries[j->key%t->size]);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: the integer to derive a key from
+ * Returns: i itself, converted to unsigned int
+ */
+unsigned int key_from_int(int i)
+{
+    unsigned int key=(unsigned int)i;
+
+    return(key);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string to hash
+ * Returns: the key; a null pointer yields 1 and the
+ *          empty string yields 0
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int hash=0;
+    unsigned char *bytes=(unsigned char *)s;
+    int pos=0;
+
+    if (!bytes) return(1);
+    while (bytes[pos]){
+        unsigned char c=bytes[pos];
+
+        /* fold in the byte scaled by a constant and by its index */
+        hash^=(c*57)^(c*pos);
+        pos++;
+    }
+    return(hash);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new, empty table, or null if
+ *          memory could not be allocated
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)calloc(1,sizeof(struct table));
+
+    if (!new) return(0);
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    /* calloc leaves every bucket as an empty (null) chain */
+    new->entries=(table_entry *)calloc(new->size,sizeof(table_entry));
+    if (!new->entries){
+        free(new);
+        return(0);
+    }
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: the table to search
+ *            comparator: the value that addresses the wanted entry
+ * Returns: the value stored for comparator, or null when the
+ *          table holds no matching entry
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *slot=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+
+    if (!found) return(0);
+    return((*slot)->value);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ *
+ *   If comparator already addresses an entry, that entry's value is
+ *   replaced; otherwise a new entry is pushed onto its bucket.  If
+ *   memory for a new entry cannot be allocated the table is left
+ *   unchanged.
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)calloc(1,sizeof(struct table_entry));
+        /* there is no way to report failure to the caller; do not
+           dereference a null entry */
+        if (!entry) return;
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    /* grow once there is more than one entry per bucket on average */
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ *
+ *   Unlinks and frees the matching table entry, if there is one.
+ *   The value the entry referred to is not freed.
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *link=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+    table_entry victim;
+
+    if (!found) return;
+
+    victim=*link;
+    *link=victim->next;
+    free(victim);
+    t->number_of_entries--;
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to walk
+ *            handler: called once per stored value as
+ *                     handler(arg, value)
+ *            arg: opaque object passed through to handler
+ * Returns: nothing
+ *
+ *   Visits the buckets in index order and each chain from its head.
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+    table_entry e;
+
+    for (i=0;i<t->size;i++){
+        e=t->entries[i];
+        while (e){
+            (*handler)(arg,e->value);
+            /* the link is read only after the handler has run */
+            e=e->next;
+        }
+    }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to walk
+ *            handler: called once per stored value as
+ *                     handler(arg, value)
+ *            arg: opaque object passed through to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() asks handler about every item in
+ *   the table. A return of 1 keeps the item; a return of 0 unlinks
+ *   and frees its table entry (the value itself is not freed).
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+    table_entry *link;
+    table_entry cur,dead;
+
+    for (i=0;i<t->size;i++){
+        link=t->entries+i;
+        while ((cur=*link)){
+            if ((*handler)(arg,cur->value)){
+                /* kept: step to the link inside this entry */
+                link=&(cur->next);
+            } else {
+                /* dropped: splice it out; link now refers to the
+                   successor, which is examined next */
+                dead=*link;
+                *link=dead->next;
+                free(dead);
+                t->number_of_entries--;
+            }
+        }
+    }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: if not null, called with the value of each
+ *                   element before its entry is freed
+ *                   (most likely free())
+ * Returns: nothing
+ *
+ *   Frees every entry, the bucket array and the table itself.
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    int i;
+    table_entry cur,following;
+
+    for (i=0;i<t->size;i++){
+        cur=t->entries[i];
+        while (cur){
+            /* remember the successor before cur is released */
+            following=cur->next;
+            if (thunk) (*thunk)(cur->value);
+            free(cur);
+            cur=following;
+        }
+    }
+    free(t->entries);
+    free(t);
+}
diff --git a/lnet/ulnds/socklnd/table.h b/lnet/ulnds/socklnd/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;             /* full hash key; table.c reduces it modulo the table size to pick a bucket */
+  void *value;                  /* the caller's object, as given to hash_table_insert() */
+  struct table_entry *next;     /* next entry in the same bucket chain, or null */
+} *table_entry;                 /* note: table_entry is itself a pointer type */
+
+
+typedef struct table {
+  unsigned int size;            /* number of buckets in entries */
+  int number_of_entries;        /* count of entries currently stored */
+  table_entry *entries;         /* array of bucket chain heads */
+  int (*compare_function)(void *, void *);        /* called as (stored value, comparator); nonzero means match */
+  unsigned int (*key_function)(unsigned int *);   /* called with a comparator; returns its hash key */
+} *table;                       /* note: table is itself a pointer type */
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c
new file mode 100644 (file)
index 0000000..8bf55c4
--- /dev/null
@@ -0,0 +1,196 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcpnal.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: n:       pointer to my nal control block
+ *            private: not examined here; passed on to lib_finalize()
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            type:    not used yet (see the warning below)
+ *            nid:     destination node
+ *            pid:     destination process
+ *            niov:    number of entries in iov; at most one
+ *            iov:     body of the message
+ *            len:     length of the body
+ * Returns: zero on success; 1 if no connection could be made or
+ *          the header and body could not be written in full
+ *
+ * sends a packet to the peer, after insuring that a connection exists
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    connection c;
+    bridge b=(bridge)n->nal_data;
+    struct iovec tiov[2];
+    int count = 1;
+    size_t total = sizeof(ptl_hdr_t);
+    long written;
+
+    if (!(c=force_tcp_connection((manager)b->lower,
+                                 PNAL_IP(nid,b),
+                                 PNAL_PORT(nid,pid)))) 
+        return(1);
+
+    LASSERT (niov <= 1);
+
+    /* header and body go out in a single gathered write */
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            total += len;
+            count++;
+    }
+
+    /* TODO: provision must be made for the SIGPIPE which is delivered
+       when writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call which turns off the signalling behaviour, but it is
+       nonstandard */
+    written = syscall(SYS_writev, c->fd, tiov, count);
+
+    /* a failed or short write means the peer did not get a complete
+       message; report that instead of finalizing it as sent */
+    if (written < 0 || (size_t)written != total)
+        return(1);
+
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *n:       pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            niov:              number of entries in iov; at most one
+ *            iov:               the destination buffer
+ *            mlen:              number of bytes to deliver into iov
+ *            rlen:              length of data in the network
+ * Returns: rlen
+ *
+ * blocking read of the requested data. must drain out the
+ * difference of manipulated and requested lengths from the network
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    if (rlen>mlen){
+        /* discard the unwanted tail through a small fixed buffer, so
+           that a large rlen sent by the peer cannot force a huge (or
+           failed) allocation */
+        char trash[4096];
+        ptl_size_t remaining=rlen-mlen;
+
+        while (remaining){
+            ptl_size_t chunk=(remaining<sizeof(trash))?
+                             remaining:sizeof(trash);
+
+            /* from_connection() treats a zero result from
+               read_connection() as "stop reading"; do the same */
+            if (!read_connection(private,trash,chunk)) break;
+            remaining-=chunk;
+        }
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection: 
+ * Arguments: a: the bridge that was registered with init_connections()
+ *            c: the connection to read from 
+ * Returns: 1 if a header was read and handed to lib_parse(),
+ *          0 if the header could not be read
+ *
+ *  from_connection() reads one portals header from the connection
+ *  and passes it, together with the connection, to the generic
+ *  library for processing.
+ */
+static int from_connection(void *a,connection c)
+{
+    bridge b=a;
+    ptl_hdr_t hdr;
+
+    if (!read_connection(c, (unsigned char *)&hdr, sizeof(hdr)))
+        return(0);
+
+    lib_parse(b->nal_cb, &hdr, c);
+    return(1);
+}
+
+
+/* Function:  tcpnal_shutdown
+ * Arguments: b: the bridge being shut down
+ *
+ *  installed as b->shutdown by tcpnal_init(); hands b->lower
+ *  to shutdown_connections().
+ */
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the bridge to attach the TCP nal to
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if
+ *          init_connections() did not return a manager
+ *
+ *  installs the send, receive and shutdown callbacks, then calls
+ *  init_connections() with the port derived from the bridge's nid
+ *  and pid and stores the resulting manager in b->lower.
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+
+    b->shutdown=tcpnal_shutdown;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->nal_cb->cb_send=tcpnal_send;
+
+    m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                 b->nal_cb->ni.pid),
+                       from_connection,b);
+    if (!m){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+
+    b->lower=m;
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    return(PTL_OK);
+}
diff --git a/lnet/ulnds/socklnd/timer.h b/lnet/ulnds/socklnd/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* guard against double inclusion: the typedefs below may not be
+   repeated in one translation unit */
+#ifndef E_TIMER
+#define E_TIMER
+
+/* TODO: make this an explicit type when they become available */
+/* a point in time or an interval; presumably counted in units of
+   1/HZ of a second (HZ is defined below) -- confirm against the
+   timer implementation */
+typedef unsigned long long when;
+
+typedef struct timer {
+  void (*function)(void *);     /* callback */
+  void *arg;                    /* presumably the argument handed to function -- confirm */
+  when w;
+  int interval;
+  int disable;
+} *timer;                       /* note: timer is itself a pointer type */
+
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void);
+void register_thunk(void (*f)(void *),void *a);
+
+
+#define HZ 0x100000000ull
+
+#endif /* E_TIMER */
+
+
diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned short uint16;
+typedef unsigned long uint32;
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table
+ */
+
+/* Function: table_lookup
+ * Arguments: t: the table to search
+ *            comparator: the value handed to the compare function
+ *            k: the full (unreduced) hash key of comparator
+ *            compare_function: called as (stored value, comparator);
+ *                              a nonzero result is a match. when null
+ *                              nothing can match
+ *            success: set to 1 on a match, 0 otherwise
+ * Returns: on a match, the address of the link that points at the
+ *          matching entry; otherwise the address of the head of the
+ *          bucket that k falls in, where a new entry can be pushed
+ */
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    table_entry *head=&(t->entries[k%t->size]);
+    table_entry *i;
+
+    *success=0;
+    /* without a compare function no entry can match, so there is
+       no point in walking the chain */
+    if (!compare_function) return(head);
+
+    for (i=head;*i;i=&((*i)->next)){
+        /* cheap full-key test first, then the function the caller
+           actually passed in */
+        if (((*i)->key==k) &&
+            (*compare_function)((*i)->value,comparator)){
+            *success=1;
+            return(i);
+        }
+    }
+    return(head);
+}
+
+
+/* Function: resize_table
+ * Arguments: t: the table to rehash
+ *            size: the new number of buckets
+ * Returns: nothing
+ *
+ *   Allocates a fresh bucket array and moves every entry onto it.
+ *   If size is not positive or the allocation fails, the table is
+ *   left exactly as it was (just more densely loaded).
+ */
+static void resize_table(table t, int size)
+{
+    unsigned int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    table_entry *new_entries;
+    unsigned int i;
+    table_entry j,n;
+    table_entry *position;
+
+    if (size<=0) return;
+    /* calloc leaves every bucket as an empty (null) chain */
+    new_entries=(table_entry *)calloc(size,sizeof(table_entry));
+    if (!new_entries) return;
+
+    t->size=size;
+    t->entries=new_entries;
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            /* entries are already unique, so no comparison is
+               needed: push onto the head of the new bucket */
+            position=&(t->entries[j->key%t->size]);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: the integer to derive a key from
+ * Returns: i itself, converted to unsigned int
+ */
+unsigned int key_from_int(int i)
+{
+    unsigned int key=(unsigned int)i;
+
+    return(key);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string to hash
+ * Returns: the key; a null pointer yields 1 and the
+ *          empty string yields 0
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int hash=0;
+    unsigned char *bytes=(unsigned char *)s;
+    int pos=0;
+
+    if (!bytes) return(1);
+    while (bytes[pos]){
+        unsigned char c=bytes[pos];
+
+        /* fold in the byte scaled by a constant and by its index */
+        hash^=(c*57)^(c*pos);
+        pos++;
+    }
+    return(hash);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new, empty table, or null if
+ *          memory could not be allocated
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)calloc(1,sizeof(struct table));
+
+    if (!new) return(0);
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    /* calloc leaves every bucket as an empty (null) chain */
+    new->entries=(table_entry *)calloc(new->size,sizeof(table_entry));
+    if (!new->entries){
+        free(new);
+        return(0);
+    }
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: the table to search
+ *            comparator: the value that addresses the wanted entry
+ * Returns: the value stored for comparator, or null when the
+ *          table holds no matching entry
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *slot=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+
+    if (!found) return(0);
+    return((*slot)->value);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ *
+ *   If comparator already addresses an entry, that entry's value is
+ *   replaced; otherwise a new entry is pushed onto its bucket.  If
+ *   memory for a new entry cannot be allocated the table is left
+ *   unchanged.
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)calloc(1,sizeof(struct table_entry));
+        /* there is no way to report failure to the caller; do not
+           dereference a null entry */
+        if (!entry) return;
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    /* grow once there is more than one entry per bucket on average */
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ *
+ *   Unlinks and frees the matching table entry, if there is one.
+ *   The value the entry referred to is not freed.
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *link=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+    table_entry victim;
+
+    if (!found) return;
+
+    victim=*link;
+    *link=victim->next;
+    free(victim);
+    t->number_of_entries--;
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to walk
+ *            handler: called once per stored value as
+ *                     handler(arg, value)
+ *            arg: opaque object passed through to handler
+ * Returns: nothing
+ *
+ *   Visits the buckets in index order and each chain from its head.
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+    table_entry e;
+
+    for (i=0;i<t->size;i++){
+        e=t->entries[i];
+        while (e){
+            (*handler)(arg,e->value);
+            /* the link is read only after the handler has run */
+            e=e->next;
+        }
+    }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to walk
+ *            handler: called once per stored value as
+ *                     handler(arg, value)
+ *            arg: opaque object passed through to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() asks handler about every item in
+ *   the table. A return of 1 keeps the item; a return of 0 unlinks
+ *   and frees its table entry (the value itself is not freed).
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+    table_entry *link;
+    table_entry cur,dead;
+
+    for (i=0;i<t->size;i++){
+        link=t->entries+i;
+        while ((cur=*link)){
+            if ((*handler)(arg,cur->value)){
+                /* kept: step to the link inside this entry */
+                link=&(cur->next);
+            } else {
+                /* dropped: splice it out; link now refers to the
+                   successor, which is examined next */
+                dead=*link;
+                *link=dead->next;
+                free(dead);
+                t->number_of_entries--;
+            }
+        }
+    }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: if not null, called with the value of each
+ *                   element before its entry is freed
+ *                   (most likely free())
+ * Returns: nothing
+ *
+ *   Frees every entry, the bucket array and the table itself.
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    int i;
+    table_entry cur,following;
+
+    for (i=0;i<t->size;i++){
+        cur=t->entries[i];
+        while (cur){
+            /* remember the successor before cur is released */
+            following=cur->next;
+            if (thunk) (*thunk)(cur->value);
+            free(cur);
+            cur=following;
+        }
+    }
+    free(t->entries);
+    free(t);
+}
diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;             /* full hash key; table.c reduces it modulo the table size to pick a bucket */
+  void *value;                  /* the caller's object, as given to hash_table_insert() */
+  struct table_entry *next;     /* next entry in the same bucket chain, or null */
+} *table_entry;                 /* note: table_entry is itself a pointer type */
+
+
+typedef struct table {
+  unsigned int size;            /* number of buckets in entries */
+  int number_of_entries;        /* count of entries currently stored */
+  table_entry *entries;         /* array of bucket chain heads */
+  int (*compare_function)(void *, void *);        /* called as (stored value, comparator); nonzero means match */
+  unsigned int (*key_function)(unsigned int *);   /* called with a comparator; returns its hash key */
+} *table;                       /* note: table is itself a pointer type */
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c
new file mode 100644 (file)
index 0000000..8bf55c4
--- /dev/null
@@ -0,0 +1,196 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcpnal.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: n:       pointer to my nal control block
+ *            private: not examined here; passed on to lib_finalize()
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            type:    not used yet (see the warning below)
+ *            nid:     destination node
+ *            pid:     destination process
+ *            niov:    number of entries in iov; at most one
+ *            iov:     body of the message
+ *            len:     length of the body
+ * Returns: zero on success; 1 if no connection could be made or
+ *          the header and body could not be written in full
+ *
+ * sends a packet to the peer, after insuring that a connection exists
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    connection c;
+    bridge b=(bridge)n->nal_data;
+    struct iovec tiov[2];
+    int count = 1;
+    size_t total = sizeof(ptl_hdr_t);
+    long written;
+
+    if (!(c=force_tcp_connection((manager)b->lower,
+                                 PNAL_IP(nid,b),
+                                 PNAL_PORT(nid,pid)))) 
+        return(1);
+
+    LASSERT (niov <= 1);
+
+    /* header and body go out in a single gathered write */
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            total += len;
+            count++;
+    }
+
+    /* TODO: provision must be made for the SIGPIPE which is delivered
+       when writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call which turns off the signalling behaviour, but it is
+       nonstandard */
+    written = syscall(SYS_writev, c->fd, tiov, count);
+
+    /* a failed or short write means the peer did not get a complete
+       message; report that instead of finalizing it as sent */
+    if (written < 0 || (size_t)written != total)
+        return(1);
+
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *n:       pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            niov:              number of entries in iov; at most one
+ *            iov:               the destination buffer
+ *            mlen:              number of bytes to deliver into iov
+ *            rlen:              length of data in the network
+ * Returns: rlen
+ *
+ * blocking read of the requested data. must drain out the
+ * difference of manipulated and requested lengths from the network
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    if (rlen>mlen){
+        /* discard the unwanted tail through a small fixed buffer, so
+           that a large rlen sent by the peer cannot force a huge (or
+           failed) allocation */
+        char trash[4096];
+        ptl_size_t remaining=rlen-mlen;
+
+        while (remaining){
+            ptl_size_t chunk=(remaining<sizeof(trash))?
+                             remaining:sizeof(trash);
+
+            /* from_connection() treats a zero result from
+               read_connection() as "stop reading"; do the same */
+            if (!read_connection(private,trash,chunk)) break;
+            remaining-=chunk;
+        }
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection: 
+ * Arguments: a: the bridge that was registered with init_connections()
+ *            c: the connection to read from 
+ * Returns: 1 if a header was read and handed to lib_parse(),
+ *          0 if the header could not be read
+ *
+ *  from_connection() reads one portals header from the connection
+ *  and passes it, together with the connection, to the generic
+ *  library for processing.
+ */
+static int from_connection(void *a,connection c)
+{
+    bridge b=a;
+    ptl_hdr_t hdr;
+
+    if (!read_connection(c, (unsigned char *)&hdr, sizeof(hdr)))
+        return(0);
+
+    lib_parse(b->nal_cb, &hdr, c);
+    return(1);
+}
+
+
+/* Function:  tcpnal_shutdown
+ * Arguments: b: the bridge being shut down
+ *
+ *  installed as b->shutdown by tcpnal_init(); hands b->lower
+ *  to shutdown_connections().
+ */
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the bridge to attach the TCP nal to
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if
+ *          init_connections() did not return a manager
+ *
+ *  installs the send, receive and shutdown callbacks, then calls
+ *  init_connections() with the port derived from the bridge's nid
+ *  and pid and stores the resulting manager in b->lower.
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+
+    b->shutdown=tcpnal_shutdown;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->nal_cb->cb_send=tcpnal_send;
+
+    m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                 b->nal_cb->ni.pid),
+                       from_connection,b);
+    if (!m){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+
+    b->lower=m;
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    return(PTL_OK);
+}
diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* TODO: make this an explicit type when they become available */
+/* Time value used throughout the timer interface.
+ * NOTE(review): HZ below is 2^32, which suggests `when' is a fixed-point
+ * count with 32 fractional bits - confirm against the implementation. */
+typedef unsigned long long when;
+
+/* A timer record.  Note that the typedef name `timer' is a POINTER to
+ * struct timer, not the struct itself. */
+typedef struct timer {
+  /* callback, and the opaque argument stored alongside it */
+  void (*function)(void *);
+  void *arg;
+  /* presumably the time at which the timer is due - confirm */
+  when w;
+  /* NOTE(review): the meaning of interval and disable is not visible
+   * from this header; check the implementation before relying on them */
+  int interval;
+  int disable;
+} *timer;
+
+/* Timer API.  Only the prototypes are visible here; the behaviour is
+ * defined by the implementation file. */
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void);
+void register_thunk(void (*f)(void *),void *a);
+
+
+/* 2^32 */
+#define HZ 0x100000000ull
+
+
diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* Fixed-width unsigned integer aliases.  The widths assume the common
+ * ILP32 and LP64 data models (char 8, short 16, int 32, long long 64
+ * bits). */
+typedef unsigned short uint16;
+/* `unsigned long' is 64 bits wide on LP64 platforms, so it cannot back
+ * a 32-bit type; `unsigned int' is 32 bits under both ILP32 and LP64. */
+typedef unsigned int uint32;
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore
new file mode 100644 (file)
index 0000000..041cd6b
--- /dev/null
@@ -0,0 +1,7 @@
+Makefile
+Makefile.in
+acceptor
+debugctl
+ptlctl
+.deps
+routerstat
diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am
new file mode 100644 (file)
index 0000000..065fcf9
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+# Override the compile and link commands: always use gcc with warnings
+# and debug info enabled, and pick up headers from ../include.
+COMPILE = gcc -Wall -g -I$(srcdir)/../include 
+LINK = gcc -o $@
+
+# Programs installed into sbin, and the static library built here.
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat
+lib_LIBRARIES = libptlctl.a
+
+acceptor_SOURCES = acceptor.c # -lefence
+
+# Sources of libptlctl.a, which ptlctl and debugctl link against.
+libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+
+# ptlctl and debugctl both link libptlctl.a from this directory (-L.)
+# together with ncurses, and are rebuilt when the library changes.
+ptlctl_SOURCES = ptlctl.c
+ptlctl_LDADD =  -L. -lptlctl -lncurses # -lefence
+ptlctl_DEPENDENCIES = libptlctl.a
+
+debugctl_SOURCES = debugctl.c
+debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
+debugctl_DEPENDENCIES = libptlctl.a
+
+routerstat_SOURCES = routerstat.c
diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c
new file mode 100644 (file)
index 0000000..c6590db
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <asm/byteorder.h>
+#include <syslog.h>
+
+#include <errno.h>
+
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
+/* should get this from autoconf somehow */
+#ifndef PIDFILE_DIR
+#define PIDFILE_DIR "/var/run"
+#endif 
+
+#define PROGNAME "acceptor"
+
+/* Record this process's pid in PIDFILE_DIR/<name>-<port>.pid.
+ * If the file cannot be opened the error is reported through syslog
+ * and nothing else happens. */
+void create_pidfile(char *name, int port)
+{
+        char path[1024];
+        FILE *out;
+
+        snprintf(path, sizeof(path), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+
+        out = fopen(path, "w");
+        if (out == NULL) {
+                syslog(LOG_ERR, "%s: %s\n", path, 
+                       strerror(errno));
+                return;
+        }
+
+        fprintf(out, "%d\n", getpid());
+        fclose(out);
+}
+
+/* Check whether PIDFILE_DIR/<name>-<port>.pid is present.
+ * Returns 1, after complaining on stderr, when the file exists (taken
+ * to mean an acceptor is already running for this port); 0 otherwise. */
+int pidfile_exists(char *name, int port)
+{
+        char path[1024];
+
+        snprintf(path, sizeof(path), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+
+        if (access(path, F_OK) != 0)
+                return (0);
+
+        fprintf(stderr, "%s: exists, acceptor already running.\n", 
+                path);
+        return (1);
+}
+
+/* Parse a decimal size with an optional single k/K, m/M or g/G suffix,
+ * which multiplies the number by 2^10, 2^20 or 2^30 respectively.
+ *
+ * sizep: receives the parsed value on success; left untouched on failure
+ * str:   text to parse
+ *
+ * Returns 0 on success, or -1 if str does not start with a number or
+ * the scaled value does not fit in an int.  Any text following the
+ * number (or the suffix) is ignored.
+ */
+int
+parse_size (int *sizep, char *str)
+{
+        /* largest int value, derived without needing <limits.h> */
+        const int       int_max = (int)(~0U >> 1);
+        int             size;
+        int             shift = 0;
+        long long       scaled;
+        char            mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod))
+        {
+        default:
+                return (-1);
+
+        case 1:
+                break;
+
+        case 2:
+                /* the scanset guarantees mod[0] is one of gGmMkK */
+                switch (*mod)
+                {
+                case 'g':
+                case 'G':
+                        shift = 30;
+                        break;
+
+                case 'm':
+                case 'M':
+                        shift = 20;
+                        break;
+
+                default:
+                        shift = 10;
+                        break;
+                }
+                break;
+        }
+
+        /* Scale in 64 bits by multiplication: shifting a signed int left
+         * is undefined when the result overflows or the value is
+         * negative. */
+        scaled = (long long)size * (1LL << shift);
+        if (scaled > int_max || scaled < -(long long)int_max - 1)
+                return (-1);
+
+        *sizep = (int)scaled;
+        return (0);
+}
+
+/* Log an accepted connection through syslog: the peer's host name (or
+ * dotted-quad address when reverse lookup fails), its NID, the socket's
+ * send and receive buffer sizes, and whether Nagle is enabled.
+ *
+ * fd:     the connected socket
+ * net_ip: the peer's IPv4 address in network byte order
+ * nid:    the peer's NID
+ *
+ * getsockopt() failures are reported with perror() and the affected
+ * value is logged as 0.
+ */
+void
+show_connection (int fd, __u32 net_ip, ptl_nid_t nid)
+{
+        struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET);
+        __u32 host_ip = ntohl (net_ip);
+        int  rxmem = 0;
+        int  txmem = 0;
+        int  nonagle = 0;
+        socklen_t len;          /* getsockopt() takes a socklen_t *, not int * */
+        char host[1024];
+        
+        len = sizeof (txmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0)
+                perror ("Cannot get write buffer size");
+        
+        len = sizeof (rxmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0)
+                perror ("Cannot get read buffer size");
+        
+        len = sizeof (nonagle);
+        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0)
+                perror ("Cannot get nagle");
+
+        if (h == NULL)
+                snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff,
+                                    (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff);
+        else
+                snprintf (host, sizeof(host), "%s", h->h_name);
+                
+        syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", 
+                 host, nid, txmem, rxmem, nonagle ? "disabled" : "enabled");
+}
+
+/* Write exactly nob bytes from buffer to cfd, retrying after EINTR and
+ * after short writes.
+ *
+ * Returns 0 once everything has been written, or the negative result
+ * of write() (with errno set) on any other error.  A write() that
+ * returns 0 is treated as a fatal inconsistency and aborts.
+ */
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                /* advance past the bytes just written (rc), not by the
+                 * number still outstanding */
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+/* Read exactly nob bytes from cfd into buffer, retrying after EINTR and
+ * after short reads.
+ *
+ * Returns 0 once nob bytes have been read.  On a read() error returns
+ * its negative result with errno set; on end-of-file before nob bytes
+ * arrived returns -1 with errno set to ECONNABORTED.
+ */
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                /* advance past the bytes just read (rc), not by the
+                 * number still outstanding */
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+/* Perform the portals HELLO handshake on a freshly accepted socket.
+ *
+ * Sends a HELLO header carrying my_nid as the source NID and the
+ * protocol magic/version in place of the destination NID, then reads
+ * the peer's HELLO header and validates its magic, version, type and
+ * (zero) payload length.
+ *
+ * cfd:      connected socket
+ * my_nid:   NID to advertise to the peer
+ * peer_nid: receives the peer's NID on success
+ *
+ * Returns 0 on success, -1 on any I/O error or protocol mismatch.
+ */
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        /* The little-endian <-> cpu conversion is its own inverse, so the
+         * __cpu_to_le*() macros also decode the values read off the wire. */
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                /* an incompatible peer must be rejected, not just reported */
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now...
+         * NOTE(review): reading sizeof (hdr) - sizeof (*hmv) bytes at
+         * hmv + 1 assumes dest_nid is the first field of ptl_hdr_t -
+         * confirm against the header definition. */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+/* Print the command-line synopsis to stderr and exit with status 1.
+ * The option list mirrors the getopt string used by main()
+ * ("N:r:s:nlxi"). */
+void
+usage (char *myname)
+{
+        fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-l] [-x] [-i] [-N nal_id] port\n", myname);
+        exit (1);
+}
+
+/* acceptor: listen on a TCP port and hand every accepted connection to
+ * the portals device.
+ *
+ * Options (see the getopt string below):
+ *   -r size  receive buffer size for the listening socket (SO_RCVBUF)
+ *   -s size  send buffer size for the listening socket (SO_SNDBUF)
+ *   -n       leave Nagle enabled (TCP_NODELAY is set by default)
+ *   -l       passed to daemon() as `noclose', keeping stdio open
+ *   -x       exchange NIDs with the peer via a HELLO handshake instead
+ *            of using the peer's IP address as its NID
+ *   -i       passed to the kernel in ioc_flags (bind_irq)
+ *   -N nal   NAL id to register peers with (default SOCKNAL)
+ *
+ * After setting up the listening socket and opening /dev/portals the
+ * process daemonizes, writes a pid file and loops forever accepting
+ * connections.  Each accepted descriptor is registered with the NAL
+ * through IOC_PORTAL_NAL_CMD and then closed in this process.
+ */
+int main(int argc, char **argv)
+{
+        int o, fd, rc, port, pfd;
+        struct sockaddr_in srvaddr;
+        int c;
+        int rxmem = 0;
+        int txmem = 0;
+        int noclose = 0;
+        int nonagle = 1;
+        int nal = SOCKNAL;
+        int xchg_nids = 0;
+        int bind_irq = 0;
+        
+        while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1)
+                switch (c)
+                {
+                case 'r':
+                        if (parse_size (&rxmem, optarg) != 0 || rxmem < 0)
+                                usage (argv[0]);
+                        break;
+                        
+                case 's':
+                        if (parse_size (&txmem, optarg) != 0 || txmem < 0)
+                                usage (argv[0]);
+                        break;
+
+                case 'n':
+                        nonagle = 0;
+                        break;
+
+                case 'l':
+                        noclose = 1;
+                        break;
+
+                case 'x':
+                        xchg_nids = 1;
+                        break;
+
+                case 'i':
+                        bind_irq = 1;
+                        break;
+                        
+                case 'N':
+                        if (parse_size(&nal, optarg) != 0 || 
+                            nal < 0 || nal > NAL_MAX_NR)
+                                usage(argv[0]);
+                        break;
+                        
+                default:
+                        usage (argv[0]);
+                        break;
+                }
+
+        if (optind >= argc)
+                usage (argv[0]);
+
+        port = atol(argv[optind++]);
+
+        if (pidfile_exists(PROGNAME, port))
+                exit(1);
+
+        memset(&srvaddr, 0, sizeof(srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(port);
+        srvaddr.sin_addr.s_addr = INADDR_ANY;
+
+        fd = socket(PF_INET, SOCK_STREAM, 0);
+        if (fd < 0) {
+                perror("opening socket");
+                exit(1);
+        }
+
+        o = 1;
+        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) {
+                perror("Cannot set REUSEADDR socket opt");
+                exit(1);
+        }
+
+        if (nonagle)
+        {
+                o = 1;
+                rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
+                if (rc != 0) 
+                { 
+                        perror ("Cannot disable nagle");
+                        exit (1);
+                }
+        }
+
+        if (txmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set write buffer size");
+                        exit (1);
+                }
+        }
+        
+        if (rxmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set read buffer size");
+                        exit (1);
+               }
+        }
+                
+        rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+        if ( rc == -1 ) {
+                perror("bind: ");
+                exit(1);
+        }
+
+        if (listen(fd, 127)) {
+                perror("listen: ");
+                exit(1);
+        }
+        fprintf(stderr, "listening on port %d\n", port);
+
+        pfd = open("/dev/portals", O_RDWR);
+        if ( pfd < 0 ) {
+                perror("opening portals device");
+                exit(1);
+        }
+
+        /* nochdir = 1: stay in the current directory */
+        rc = daemon(1, noclose);
+        if (rc < 0) {
+                perror("daemon(): ");
+                exit(1);
+        }
+
+        openlog(PROGNAME, LOG_PID, LOG_DAEMON);
+        syslog(LOG_INFO, "started, listening on port %d\n", port);
+        create_pidfile(PROGNAME, port);
+
+        while (1) {
+                struct sockaddr_in clntaddr;
+                socklen_t len = sizeof(clntaddr); /* accept() wants socklen_t * */
+                int cfd;
+                struct portal_ioctl_data data;
+                ptl_nid_t peer_nid;
+                
+                cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
+                if ( cfd < 0 ) {
+                        /* an interrupted call or a connection that was
+                         * dropped before we picked it up is transient */
+                        if (errno == EINTR || errno == ECONNABORTED)
+                                continue;
+
+                        /* anything else is fatal: exit with a failure
+                         * status rather than reporting success */
+                        perror("accept");
+                        exit(1);
+                }
+
+                if (!xchg_nids)
+                        peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */
+                else
+                {
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = nal;
+                        rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data);
+                        if (rc < 0)
+                        {
+                                perror ("Can't get my NID");
+                                close (cfd);
+                                continue;
+                        }
+                        
+                        rc = exchange_nids (cfd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (cfd);
+                                continue;
+                        }
+                }
+
+                show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid);
+                
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = cfd;
+                data.ioc_nal = nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
+                        perror("ioctl failed");
+
+                } else {
+                        printf("client registered\n");
+                }
+                /* our copy of the descriptor is closed whether or not
+                 * the registration succeeded */
+                rc = close(cfd);
+                if (rc)
+                        perror ("close failed");
+        }
+
+        /* not reached: the accept loop above never terminates */
+        closelog();
+        exit(0);
+
+}
diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c
new file mode 100644 (file)
index 0000000..13572dc
--- /dev/null
@@ -0,0 +1,620 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <syscall.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#define BUG()                            /* workaround for module.h includes */
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/module.h>
+#endif
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+/* Scratch buffer for packed ioctl requests: the commands in this file
+ * pass &buf and max to portal_ioctl_pack() and then hand buf to
+ * l_ioctl(). */
+static char rawbuf[8192];
+static char *buf = rawbuf;
+static int max = 8192;
+//static int g_pfd = -1;
+/* Per-subsystem filter flags; non-zero means "show".  Indexed by the
+ * subsystem's position in portal_debug_subsystems[]. */
+static int subsystem_array[1 << 8];
+/* Debug-type filter; bit i corresponds to portal_debug_masks[i].
+ * Starts with every type enabled. */
+static int debug_mask = ~0;
+
+/* NULL-terminated list of subsystem names. */
+static const char *portal_debug_subsystems[] =
+        {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite",
+         "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter",
+         "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL};
+/* NULL-terminated list of debug-type names. */
+static const char *portal_debug_masks[] =
+        {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
+         "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
+         "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL};
+
+/* Maps a debug-daemon command word to its DEBUG_DAEMON_* code. */
+struct debug_daemon_cmd {
+        char *cmd;
+        unsigned int cmdv;
+};
+
+/* Commands accepted by jt_dbg_debug_daemon(); the list ends with an
+ * entry whose cmd is NULL. */
+static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = {
+        {"start", DEBUG_DAEMON_START},
+        {"stop", DEBUG_DAEMON_STOP},
+        {"pause", DEBUG_DAEMON_PAUSE},
+        {"continue", DEBUG_DAEMON_CONTINUE},
+        {0, 0}
+};
+
+/* Switch output on or off for everything that `name' selects.
+ *
+ * name is compared case-insensitively against the subsystem names and
+ * the debug-type names; "all_subs" selects every subsystem and
+ * "all_types" every debug type.  Each affected entry is announced on
+ * stdout.
+ *
+ * Returns 1 if name selected at least one entry, 0 otherwise.
+ */
+static int do_debug_mask(char *name, int enable)
+{
+        const char *verb = enable ? "Enabling" : "Disabling";
+        int every_sub = (strcasecmp(name, "all_subs") == 0);
+        int every_type = (strcasecmp(name, "all_types") == 0);
+        int matched = 0;
+        int idx;
+
+        for (idx = 0; portal_debug_subsystems[idx] != NULL; idx++) {
+                if (!every_sub &&
+                    strcasecmp(name, portal_debug_subsystems[idx]) != 0)
+                        continue;
+
+                printf("%s output from subsystem \"%s\"\n",
+                        verb, portal_debug_subsystems[idx]);
+                subsystem_array[idx] = enable;
+                matched = 1;
+        }
+
+        for (idx = 0; portal_debug_masks[idx] != NULL; idx++) {
+                if (!every_type &&
+                    strcasecmp(name, portal_debug_masks[idx]) != 0)
+                        continue;
+
+                printf("%s output of type \"%s\"\n",
+                        verb, portal_debug_masks[idx]);
+                if (enable)
+                        debug_mask |= (1 << idx);
+                else
+                        debug_mask &= ~(1 << idx);
+                matched = 1;
+        }
+
+        return matched;
+}
+
+/* Enable output from every subsystem.  argc and argv are unused.
+ * Always returns 0.
+ *
+ * memset() fills bytes, so each int in subsystem_array ends up with
+ * every byte set to 1 rather than holding the value 1.  Within the
+ * code visible here the entries are only tested for being non-zero. */
+int dbg_initialize(int argc, char **argv)
+{
+        memset(subsystem_array, 1, sizeof(subsystem_array));
+        return 0;
+}
+
+/* Command handler: disable output for each subsystem or debug type
+ * named in argv[1..].  Unknown names are reported on stderr.
+ * Always returns 0, including when only a usage message is printed. */
+int jt_dbg_filter(int argc, char **argv)
+{
+        int   arg;
+
+        if (argc >= 2) {
+                for (arg = 1; arg < argc; arg++) {
+                        if (do_debug_mask(argv[arg], 0))
+                                continue;
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                argv[arg]);
+                }
+                return 0;
+        }
+
+        fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                argv[0]);
+        return 0;
+}
+
+/* Command handler: enable output for each subsystem or debug type
+ * named in argv[1..].  Unknown names are reported on stderr.
+ * Always returns 0, including when only a usage message is printed. */
+int jt_dbg_show(int argc, char **argv)
+{
+        char **namep;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        for (namep = argv + 1; namep < argv + argc; namep++) {
+                if (do_debug_mask(*namep, 1) == 0)
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                *namep);
+        }
+
+        return 0;
+}
+
+/* Write `value', formatted as a decimal string, to the file procpath.
+ *
+ * procpath: path of an existing file to open for writing
+ * value:    number to write
+ *
+ * Returns 0 on success, or the negative result of the failing open()
+ * or write() after printing a diagnostic to stderr.  The descriptor is
+ * closed on every path.
+ */
+static int applymask(char* procpath, int value)
+{
+        int rc;
+        int fd;
+        char buf[64];
+        int len = snprintf(buf, 64, "%d", value);
+
+        fd = open(procpath, O_WRONLY);
+        if (fd == -1) {
+                fprintf(stderr, "Unable to open %s: %s\n",
+                        procpath, strerror(errno));
+                return fd;
+        }
+        /* len+1: the terminating NUL is written along with the digits */
+        rc = write(fd, buf, len+1);
+        if (rc<0) {
+                /* report before close() so errno still describes write() */
+                fprintf(stderr, "Write to %s failed: %s\n",
+                        procpath, strerror(errno));
+                close(fd);
+                return rc;
+        }
+        close(fd);
+        return 0;
+}
+
+extern char *dump_filename;
+extern int dump(int dev_id, int opc, void *buf);
+
+/* Apply the subsystem and debug-type masks.
+ *
+ * When no dump file is configured (dump_filename is NULL) the masks
+ * are written to /proc/sys/portals/subsystem_debug and
+ * /proc/sys/portals/debug; otherwise they are packaged into a
+ * PTL_IOC_DEBUG_MASK request and passed to dump().  The results of
+ * applymask() and dump() are not checked, and the confirmation message
+ * is printed either way.
+ */
+static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
+{
+        if (!dump_filename) {
+                applymask("/proc/sys/portals/subsystem_debug", subs_mask);
+                applymask("/proc/sys/portals/debug", debug_mask);
+        } else {
+                struct portals_debug_ioctl_data data;
+
+                data.hdr.ioc_len = sizeof(data);
+                data.hdr.ioc_version = 0;
+                data.subs = subs_mask;
+                data.debug = debug_mask;
+
+                dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data);
+        }
+        /* both masks are unsigned, so print them with %u */
+        printf("Applied subsystem_debug=%u, debug=%u to /proc/sys/portals\n",
+               subs_mask, debug_mask);
+}
+
+/* Command handler taking exactly one argument:
+ *   "subs"       - list the known subsystem names
+ *   "types"      - list the known debug-type names
+ *   "applymasks" - build a bitmask from the enabled subsystems and
+ *                  apply it together with debug_mask
+ * Any other argument is silently ignored.  Always returns 0.
+ */
+int jt_dbg_list(int argc, char **argv)
+{
+        const char *what;
+        int idx;
+
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <subs || types>\n", argv[0]);
+                return 0;
+        }
+
+        what = argv[1];
+
+        if (strcasecmp(what, "subs") == 0) {
+                printf("Subsystems: all_subs");
+                for (idx = 0; portal_debug_subsystems[idx] != NULL; idx++)
+                        printf(", %s", portal_debug_subsystems[idx]);
+                printf("\n");
+                return 0;
+        }
+
+        if (strcasecmp(what, "types") == 0) {
+                printf("Types: all_types");
+                for (idx = 0; portal_debug_masks[idx] != NULL; idx++)
+                        printf(", %s", portal_debug_masks[idx]);
+                printf("\n");
+                return 0;
+        }
+
+        if (strcasecmp(what, "applymasks") == 0) {
+                unsigned int subs_bits = 0;
+
+                for (idx = 0; portal_debug_subsystems[idx] != NULL; idx++) {
+                        if (subsystem_array[idx])
+                                subs_bits |= (1 << idx);
+                }
+                applymask_all(subs_bits, debug_mask);
+        }
+
+        return 0;
+}
+
+/* Print the lines of a debug log that pass the current filters.
+ *
+ * Each newline-terminated line in buf[0..size) starts with two
+ * hexadecimal fields: the subsystem number and the debug mask.  A line
+ * is kept when its subsystem is known and enabled in subsystem_array,
+ * and its mask is either zero or shares a bit with debug_mask.  If
+ * 'raw' is true the whole line is printed; otherwise the two leading
+ * fields are stripped.  A trailing fragment without a newline is
+ * ignored.  A summary of the line counts goes to stdout.
+ */
+static void dump_buffer(FILE *fd, char *buf, int size, int raw)
+{
+        unsigned long nkept = 0, ndropped = 0;
+        int nsubs = 0;
+
+        while (portal_debug_subsystems[nsubs] != NULL)
+                nsubs++;
+
+        while (size) {
+                unsigned long subsys, mask;
+                char *cursor;
+                char *eol = memchr(buf, '\n', size);
+
+                if (eol == NULL)
+                        break;
+
+                subsys = strtoul(buf, &cursor, 16);
+                mask = strtoul(cursor + 1, &cursor, 16);
+                cursor++;
+
+                /* temporarily terminate the line so it prints with %s */
+                *eol = '\0';
+                if (subsys >= nsubs || !subsystem_array[subsys] ||
+                    (mask && !(debug_mask & mask))) {
+                        ndropped++;
+                } else {
+                        fprintf(fd, "%s\n", raw ? buf : cursor);
+                        nkept++;
+                }
+                *eol = '\n';
+
+                /* step to the character after the newline */
+                size -= (eol + 1 - buf);
+                buf = eol + 1;
+        }
+
+        printf("Debug log: %lu lines, %lu kept, %lu dropped.\n",
+                ndropped + nkept, nkept, ndropped);
+}
+
+/* Command handler: fetch the kernel debug buffer with
+ * IOC_PORTAL_GET_DEBUG and print it through dump_buffer().
+ *
+ * argv[1] (optional): output file; defaults to stdout
+ * argv[2] (optional): raw flag, parsed with atoi(); defaults to 1
+ *
+ * Returns 0 on success (including an empty debug buffer) or when only
+ * the usage message is printed, and -1 on any failure.  The data buffer
+ * and the output file are released on every path.
+ */
+int jt_dbg_debug_kernel(int argc, char **argv)
+{
+        int rc = -1, raw = 1;
+        FILE *fd = stdout;
+        const int databuf_size = (6 << 20);
+        struct portal_ioctl_data data, *newdata;
+        char *databuf = NULL;
+
+        if (argc > 3) {
+                fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc > 1) {
+                fd = fopen(argv[1], "w");
+                if (fd == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                                strerror(errno));
+                        return -1;
+                }
+        }
+        if (argc > 2)
+                raw = atoi(argv[2]);
+
+        databuf = malloc(databuf_size);
+        if (!databuf) {
+                fprintf(stderr, "No memory for buffer.\n");
+                goto out;
+        }
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_plen1 = databuf_size;
+        data.ioc_pbuf1 = databuf;
+
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                goto out;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf)) {
+                fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n",
+                        strerror(errno));
+                goto out;
+        }
+
+        newdata = (struct portal_ioctl_data *)buf;
+        if (newdata->ioc_size > 0)
+                dump_buffer(fd, databuf, newdata->ioc_size, raw);
+        else
+                fprintf(stderr, "No data in the debug buffer.\n");
+
+        rc = 0;
+
+ out:
+        /* failures arrive here with rc still -1 */
+        free(databuf);
+        if (fd != stdout)
+                fclose(fd);
+        return rc;
+}
+
+/* Command handler: control the debug daemon through
+ * IOC_PORTAL_SET_DAEMON.
+ *
+ * argv[1]: one of start, stop, pause, continue (case-insensitive)
+ * argv[2]: for "start", the log file; it must either be creatable or
+ *          already be writable
+ * argv[3]: for "start", optional size passed in ioc_misc (parsed with
+ *          strtoul, base auto-detected)
+ *
+ * Returns 0 on success or when only the usage message is printed, -1
+ * if the file check, the size parse or the request packing fails, and
+ * the negative l_ioctl() result if the ioctl fails.
+ */
+int jt_dbg_debug_daemon(int argc, char **argv)
+{
+        int i, rc;
+        unsigned int cmd = 0;
+        FILE *fd = stdout;
+        struct portal_ioctl_data data;
+
+        if (argc <= 1) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        for (i = 0; portal_debug_daemon_cmd[i].cmd != NULL; i++) {
+                if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) {
+                        cmd = portal_debug_daemon_cmd[i].cmdv;
+                        break;
+                }
+        }
+        /* the loop ran off the end of the table: unknown command word */
+        if (portal_debug_daemon_cmd[i].cmd == NULL) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        memset(&data, 0, sizeof(data));
+        if (cmd == DEBUG_DAEMON_START) {
+                if (argc < 3) {
+                        fprintf(stderr, "usage: %s [start file <#MB>|stop|"
+                                "pause|continue]\n", argv[0]);
+                        return 0;
+                }
+                /* If the file does not exist, probe that it can be
+                 * created, then remove the probe file again. */
+                if (access(argv[2], F_OK) != 0) {
+                        fd = fopen(argv[2], "w");
+                        if (fd != NULL) {
+                                fclose(fd);
+                                remove(argv[2]);
+                                goto ok;
+                        }
+                }
+                if (access(argv[2], W_OK) == 0)
+                        goto ok;
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                        strerror(errno));
+                return -1;
+ok:
+                data.ioc_inllen1 = strlen(argv[2]) + 1;
+                data.ioc_inlbuf1 = argv[2];
+                data.ioc_misc = 0;
+                if (argc == 4) {
+                        unsigned long size;
+                        errno = 0;
+                        size = strtoul(argv[3], NULL, 0);
+                        if (errno) {
+                                fprintf(stderr, "file size(%s): error %s\n",
+                                        argv[3], strerror(errno));
+                                return -1;
+                        }
+                        data.ioc_misc = size;
+                }
+        }
+        data.ioc_count = cmd;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf);
+        if (rc < 0) {
+                /* name the ioctl as it is actually spelled */
+                fprintf(stderr, "IOC_PORTAL_SET_DAEMON failed: %s\n",
+                                strerror(errno));
+                return rc;
+        }
+        return 0;
+}
+
+/* Command handler: filter a previously saved debug log.
+ *
+ * argv[1]: input file, mapped privately and fed to dump_buffer()
+ * argv[2] (optional): output file; defaults to stdout
+ * argv[3] (optional): raw flag, parsed with atoi(); defaults to 1
+ *
+ * Returns 0 on success or when only the usage message is printed, and
+ * -1 on any failure.  The mapping, the output file and the input
+ * descriptor are released on every path.
+ */
+int jt_dbg_debug_file(int argc, char **argv)
+{
+        int rc, fd = -1, raw = 1;
+        int ret = -1;
+        FILE *output = stdout;
+        char *databuf = NULL;
+        struct stat statbuf;
+
+        if (argc > 4 || argc < 2) {
+                fprintf(stderr, "usage: %s <input> [output] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        fd = open(argv[1], O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                        strerror(errno));
+                return -1;
+        }
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+        rc = syscall(__SYS_fstat__, fd, &statbuf);
+        if (rc < 0) {
+                fprintf(stderr, "fstat failed: %s\n", strerror(errno));
+                goto out;
+        }
+
+        if (argc >= 3) {
+                output = fopen(argv[2], "w");
+                if (output == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                                strerror(errno));
+                        goto out;
+                }
+        }
+
+        if (argc == 4)
+                raw = atoi(argv[3]);
+
+        /* PROT_WRITE with MAP_PRIVATE: dump_buffer() temporarily edits
+         * the buffer in place, and the file itself must stay untouched */
+        databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+        /* mmap() signals failure with MAP_FAILED, never NULL */
+        if (databuf == MAP_FAILED) {
+                fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+                databuf = NULL;         /* nothing to unmap below */
+                goto out;
+        }
+
+        dump_buffer(output, databuf, statbuf.st_size, raw);
+        ret = 0;
+
+ out:
+        if (databuf)
+                munmap(databuf, statbuf.st_size);
+        /* output is NULL when opening the output file failed */
+        if (output != NULL && output != stdout)
+                fclose(output);
+        /* 0 is a valid descriptor too */
+        if (fd >= 0)
+                close(fd);
+        return ret;
+}
+
+/*
+ * clear
+ *
+ * Packs a zeroed portal_ioctl_data into buf and sends IOC_PORTAL_CLEAR_DEBUG
+ * to the portals device.  The command takes no arguments.
+ *
+ * Returns 0 on success or after printing usage, -1 if packing or the ioctl
+ * fails.
+ */
+int jt_dbg_clear_debug_buf(int argc, char **argv)
+{
+        struct portal_ioctl_data ioc;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&ioc, 0, sizeof(ioc));
+        if (portal_ioctl_pack(&ioc, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * mark [marker text]
+ *
+ * Sends IOC_PORTAL_MARK_DEBUG with a text marker as inline buffer 1.  The
+ * marker is argv[1] when supplied; otherwise it is the ctime() string for the
+ * current time with its trailing newline removed.
+ *
+ * Returns 0 on success or after printing usage, -1 if packing or the ioctl
+ * fails.
+ */
+int jt_dbg_mark_debug_buf(int argc, char **argv)
+{
+        struct portal_ioctl_data ioc;
+        time_t now = time(NULL);
+        char *marker;
+
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [marker text]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc != 2) {
+                /* ctime() ends its result with '\n'; chop it off */
+                marker = ctime(&now);
+                marker[strlen(marker) - 1] = '\0';
+        } else {
+                marker = argv[1];
+        }
+
+        memset(&ioc, 0, sizeof(ioc));
+        ioc.ioc_inlbuf1 = marker;
+        ioc.ioc_inllen1 = strlen(marker) + 1;
+        if (portal_ioctl_pack(&ioc, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * modules [path] [kernel]
+ *
+ * On kernels older than 2.5, asks query_module(QM_INFO) about each module in
+ * a fixed table and, for every one that is found, prints a gdb
+ * "add-symbol-file" line built from the base path (argv[1], default "..") and
+ * the module's source directory.  Modules that are not loaded (ENOENT) are
+ * skipped silently; other query failures are reported on stdout.
+ *
+ * On 2.5 and later it only prints a "not yet implemented" notice.
+ * Always returns 0.
+ */
+int jt_dbg_modules(int argc, char **argv)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        struct mod_paths {
+                char *name, *path;
+        } *mod, modules[] = {
+                {"portals", "portals/linux/oslib"},
+                {"ksocknal", "portals/linux/socknal"},
+                {"obdclass", "lustre/obdclass"},
+                {"ptlrpc", "lustre/ptlrpc"},
+                {"obdext2", "lustre/obdext2"},
+                {"ost", "lustre/ost"},
+                {"osc", "lustre/osc"},
+                {"mds", "lustre/mds"},
+                {"mdc", "lustre/mdc"},
+                {"llite", "lustre/llite"},
+                {"obdecho", "lustre/obdecho"},
+                {"ldlm", "lustre/ldlm"},
+                {"obdfilter", "lustre/obdfilter"},
+                {"extN", "lustre/extN"},
+                {"lov", "lustre/lov"},
+                {"fsfilt_ext3", "lustre/obdclass"},
+                {"fsfilt_extN", "lustre/obdclass"},
+                {"mds_ext2", "lustre/mds"},
+                {"mds_ext3", "lustre/mds"},
+                {"mds_extN", "lustre/mds"},
+                {"ptlbd", "lustre/ptlbd"},
+                {NULL, NULL}
+        };
+        char *path = "..";
+        char *kernel = "linux";
+
+        if (argc > 3) {
+                printf("%s [path] [kernel]\n", argv[0]);
+                return 0;
+        }
+        if (argc >= 2)
+                path = argv[1];
+        if (argc == 3)
+                kernel = argv[2];
+
+        mod = modules;
+        while (mod->name != NULL) {
+                struct module_info info;
+                size_t ret_size;
+                int rc;
+                int query_module(const char *name, int which, void *buf,
+                                 size_t bufsize, size_t *ret);
+
+                rc = query_module(mod->name, QM_INFO, &info, sizeof(info),
+                                  &ret_size);
+                if (rc >= 0) {
+                        printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path,
+                               mod->path, mod->name,
+                               info.addr + sizeof(struct module));
+                } else if (errno != ENOENT) {
+                        printf("query_module(%s) failed: %s\n",
+                               mod->name, strerror(errno));
+                }
+                mod++;
+        }
+
+        return 0;
+#else
+        printf("jt_dbg_module is not yet implemented for Linux 2.5\n");
+        return 0;
+#endif /* linux 2.5 */
+}
+
+/*
+ * panic
+ *
+ * Packs a zeroed portal_ioctl_data into buf and sends IOC_PORTAL_PANIC to the
+ * portals device.  The command takes no arguments.
+ *
+ * Returns 0 on success or after printing usage, -1 if packing or the ioctl
+ * fails.
+ */
+int jt_dbg_panic(int argc, char **argv)
+{
+        struct portal_ioctl_data ioc;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&ioc, 0, sizeof(ioc));
+        if (portal_ioctl_pack(&ioc, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
diff --git a/lnet/utils/debugctl.c b/lnet/utils/debugctl.c
new file mode 100644 (file)
index 0000000..02cb9b4
--- /dev/null
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+
+command_t list[] = {    /* Command table for debugctl; main() hands it to
+                         * Parser_init() and Parser_execarg().  Each entry
+                         * is {command name, handler, 0, help text} (the
+                         * third field is presumably the sub-command table
+                         * -- confirm against parser.h).  The all-zero
+                         * entry at the end terminates the table. */
+        {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"},
+        {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, 
+        {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file <input> [output] [raw], read debug buffer from input and print it [to output]"},
+        {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"},
+        {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"},
+        {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"},
+        {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"},
+        {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"},
+        {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: <path>)"},
+        {"panic", jt_dbg_panic, 0, "cause the kernel to panic"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
+/*
+ * debugctl entry point.
+ *
+ * With arguments, runs the single command named by argv[1..] and returns its
+ * result; without arguments, runs the interactive command loop and returns 0.
+ * Exits with status 2 if dbg_initialize() fails.
+ */
+int main(int argc, char **argv)
+{
+        int rc = 0;
+
+        if (dbg_initialize(argc, argv) < 0)
+                exit(2);
+
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+
+        Parser_init("debugctl > ", list);
+        if (argc > 1)
+                rc = Parser_execarg(argc - 1, &argv[1], list);
+        else
+                Parser_commands();
+
+        /* Release the device on both paths, not just the interactive one. */
+        unregister_ioc_dev(PORTALS_DEV_ID);
+        return rc;
+}
diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c
new file mode 100644 (file)
index 0000000..722bb57
--- /dev/null
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+struct ioc_dev {                        /* one registered ioctl target */
+       const char * dev_name;           /* path passed to open(); NULL while the slot is unregistered */
+       int dev_fd;                      /* cached descriptor; set to -1 by register/unregister until first opened */
+};
+
+static struct ioc_dev ioc_dev_list[10]; /* slots indexed by dev_id */
+
+struct dump_hdr {                       /* written by dump() ahead of each saved ioctl buffer */
+       int magic;                       /* dump() stores 0xdeadbeef here */
+       int dev_id;                      /* device id the ioctl was addressed to */
+       int opc;                         /* ioctl opcode */
+};
+
+char * dump_filename;                   /* when non-NULL, l_ioctl() calls dump() instead of do_ioctl() */
+
+/*
+ * Return an open descriptor for registered device dev_id, opening the device
+ * node on first use and caching the descriptor in ioc_dev_list.
+ *
+ * Returns the descriptor (>= 0) on success, -EINVAL if dev_id is out of
+ * range or not registered, or the negative value returned by open() if the
+ * device node cannot be opened.
+ */
+static int
+open_ioc_dev(int dev_id)
+{
+        /* Slot count; sizeof(ioc_dev_list) on its own is a byte count. */
+        const int ndevs = sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]);
+        struct ioc_dev *dev;
+
+        if (dev_id < 0 || dev_id >= ndevs)
+                return -EINVAL;
+
+        dev = &ioc_dev_list[dev_id];
+        if (dev->dev_name == NULL) {
+                fprintf(stderr, "unknown device id: %d\n", dev_id);
+                return -EINVAL;
+        }
+
+        if (dev->dev_fd < 0) {
+                int fd = open(dev->dev_name, O_RDWR);
+
+                if (fd < 0) {
+                        fprintf(stderr, "opening %s failed: %s\n"
+                                "hint: the kernel modules may not be loaded\n",
+                                dev->dev_name, strerror(errno));
+                        return fd;
+                }
+                dev->dev_fd = fd;
+        }
+
+        return dev->dev_fd;
+}
+
+
+/*
+ * Issue ioctl opc with argument buf on the device registered as dev_id.
+ * Returns the negative value from open_ioc_dev() if the device cannot be
+ * opened, otherwise whatever ioctl() returns.
+ */
+static int
+do_ioctl(int dev_id, int opc, void *buf)
+{
+        int fd = open_ioc_dev(dev_id);
+
+        return (fd < 0) ? fd : ioctl(fd, opc, buf);
+}
+
+/*
+ * Open the current dump file for appending.  Returns NULL (after a message
+ * on stderr) when no dump filename is set, or NULL from fopen() on failure.
+ */
+static FILE *
+get_dump_file()
+{
+        if (dump_filename)
+                return fopen(dump_filename, "a");
+
+        fprintf(stderr, "no dump filename\n");
+        return NULL;
+}
+
+/*
+ * Append one ioctl request to the dump file instead of issuing it: a
+ * struct dump_hdr (magic, device id, opcode) followed by ioc_len bytes of
+ * buf, where ioc_len is read from the portal_ioctl_hdr at the start of buf.
+ *
+ * The dump file should eventually start with a description of which devices
+ * are used; for now it is assumed that whatever application reads the file
+ * knows what to do.
+ *
+ * Returns 0 on success and -EINVAL if the file cannot be opened or written.
+ */
+int
+dump(int dev_id, int opc, void *buf)
+{
+        struct portal_ioctl_hdr *req = (struct portal_ioctl_hdr *) buf;
+        struct dump_hdr hdr;
+        FILE *out;
+        int nitems;
+
+        printf("dumping opc %x to %s\n", opc, dump_filename);
+
+        hdr.magic = 0xdeadbeef;
+        hdr.dev_id = dev_id;
+        hdr.opc = opc;
+
+        out = get_dump_file();
+        if (out == NULL) {
+                fprintf(stderr, "%s: %s\n", dump_filename,
+                        strerror(errno));
+                return -EINVAL;
+        }
+
+        /* the payload is only written once the header went out whole */
+        nitems = fwrite(&hdr, sizeof(hdr), 1, out);
+        if (nitems == 1)
+                nitems = fwrite(buf, req->ioc_len, 1, out);
+        fclose(out);
+
+        if (nitems == 1)
+                return 0;
+
+        fprintf(stderr, "%s: %s\n", dump_filename,
+                strerror(errno));
+        return -EINVAL;
+}
+
+/*
+ * Register a device to send ioctls to.  Any previous registration of dev_id
+ * is dropped first; the device node itself is not opened here (dev_fd is set
+ * to -1).  dev_name is stored by pointer, not copied.
+ *
+ * Returns dev_id on success, -EINVAL if dev_id is outside ioc_dev_list.
+ */
+int
+register_ioc_dev(int dev_id, const char * dev_name)
+{
+        /* Slot count; sizeof(ioc_dev_list) on its own is a byte count. */
+        const int ndevs = sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]);
+
+        if (dev_id < 0 || dev_id >= ndevs)
+                return -EINVAL;
+
+        unregister_ioc_dev(dev_id);
+
+        ioc_dev_list[dev_id].dev_name = dev_name;
+        ioc_dev_list[dev_id].dev_fd = -1;
+
+        return dev_id;
+}
+
+/*
+ * Forget the device registered as dev_id, closing its cached descriptor if
+ * one is open.  Out-of-range ids are ignored.
+ */
+void
+unregister_ioc_dev(int dev_id)
+{
+        /* Slot count; sizeof(ioc_dev_list) on its own is a byte count. */
+        const int ndevs = sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]);
+
+        if (dev_id < 0 || dev_id >= ndevs)
+                return;
+
+        /* A never-registered slot is zero-filled, so dev_fd == 0 there is
+         * not a descriptor we own; only close when a name is present. */
+        if (ioc_dev_list[dev_id].dev_name != NULL &&
+            ioc_dev_list[dev_id].dev_fd >= 0)
+                close(ioc_dev_list[dev_id].dev_fd);
+
+        ioc_dev_list[dev_id].dev_name = NULL;
+        ioc_dev_list[dev_id].dev_fd = -1;
+}
+
+/*
+ * Select the file that ioctl buffers are appended to.  Once a file is set,
+ * l_ioctl() records every request there instead of sending it to the device.
+ * The name is copied, so the caller keeps ownership of file.
+ *
+ * Returns 0 on success, -EINVAL if file is NULL, or -ENOMEM if the name
+ * cannot be copied; on failure the previous setting is left untouched.
+ */
+int
+set_ioctl_dump(char * file)
+{
+        char *copy;
+
+        if (file == NULL)
+                return -EINVAL;
+
+        /* Copy first so a failed allocation does not lose the old name. */
+        copy = strdup(file);
+        if (copy == NULL)
+                return -ENOMEM;
+
+        free(dump_filename);
+        dump_filename = copy;
+        return 0;
+}
+
+/*
+ * Front end for all ioctls: while a dump file is set the request is appended
+ * to it by dump(); otherwise it is sent to the device by do_ioctl().
+ */
+int
+l_ioctl(int dev_id, int opc, void *buf)
+{
+        return dump_filename ? dump(dev_id, opc, buf)
+                             : do_ioctl(dev_id, opc, buf);
+}
+
+/*
+ * Read an ioctl dump file and call ioc_func for each ioctl buffer in it.
+ * For example:
+ *
+ * parse_dump("lctl.dump", l_ioctl);
+ *
+ * Note: if using l_ioctl, then you also need to register_ioc_dev() for
+ * each device used in the dump.
+ *
+ * Each record is a struct dump_hdr followed by an ioctl buffer whose length
+ * is the ioc_len field of its portal_ioctl_hdr.  Every buffer is copied to a
+ * local scratch area before ioc_func sees it.
+ *
+ * Returns 0 when the whole file was replayed, -1 if a record is truncated,
+ * runs past the end of the file or is larger than the scratch area.  Exits
+ * with status 1 if the file cannot be opened, stat'ed or mapped, is empty,
+ * or if ioc_func returns non-zero.
+ */
+int
+parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
+{
+        int fd, line = 0;
+        int rc = 0;
+        struct stat st;
+        char *map, *buf, *end;
+
+        fd = syscall(SYS_open, dump_file, O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "opening %s failed: %s\n", dump_file,
+                        strerror(errno));
+                exit(1);
+        }
+
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+        if (syscall(__SYS_fstat__, fd, &st)) {
+                perror("stat fails");
+                exit(1);
+        }
+
+        if (st.st_size < 1) {
+                fprintf(stderr, "KML is empty\n");
+                exit(1);
+        }
+
+        /* mmap() reports failure with MAP_FAILED, never with NULL. */
+        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+        if (map == MAP_FAILED) {
+                perror("mmap fails");
+                exit(1);
+        }
+        close(fd);
+
+        buf = map;
+        end = map + st.st_size;
+        while (buf < end) {
+                struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+                struct portal_ioctl_hdr * data;
+                char tmp[8096];
+                size_t left = end - buf;
+
+                line++;
+
+                /* Both headers must be inside the file before ioc_len can
+                 * be read. */
+                if (left < sizeof(*dump_hdr) + sizeof(*data)) {
+                        fprintf(stderr, "dump file truncated in record %d\n",
+                                line);
+                        rc = -1;
+                        break;
+                }
+
+                data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+                /* The payload starts after the dump header, so that header
+                 * counts against the space that is left. */
+                if (data->ioc_len > left - sizeof(*dump_hdr)) {
+                        fprintf(stderr, "dump file overflow, %p + %lu > %p\n",
+                                (void *)buf, (unsigned long)data->ioc_len,
+                                (void *)end);
+                        rc = -1;
+                        break;
+                }
+                if (data->ioc_len > sizeof(tmp)) {
+                        fprintf(stderr, "record %d: ioctl buffer of %lu bytes "
+                                "is too large\n", line,
+                                (unsigned long)data->ioc_len);
+                        rc = -1;
+                        break;
+                }
+#if 0
+                printf ("dump_hdr: %lx data: %lx\n",
+                        (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+
+                printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
+                       data->ioc_len, data->ioc_version);
+#endif
+
+                /* The mapping is read-only; ioc_func gets a private copy. */
+                memcpy(tmp, data, data->ioc_len);
+
+                rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+                if (rc) {
+                        printf("failed: %d\n", rc);
+                        exit(1);
+                }
+
+                buf += data->ioc_len + sizeof(*dump_hdr);
+        }
+
+        munmap(map, st.st_size);
+        return rc;
+}
+
+/*
+ * dump <file>
+ *
+ * Makes <file> the ioctl dump file via set_ioctl_dump().  Exactly one
+ * argument is required: argv[1] is not valid when the command is given
+ * without one.
+ *
+ * Returns 0 on success or after printing usage, -1 if the dump file could
+ * not be set.
+ */
+int
+jt_ioc_dump(int argc, char **argv)
+{
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <file>\n", argv[0]);
+                return 0;
+        }
+        printf("setting dumpfile to: %s\n", argv[1]);
+
+        if (set_ioctl_dump(argv[1]) != 0) {
+                fprintf(stderr, "setting dumpfile to %s failed\n", argv[1]);
+                return -1;
+        }
+        return 0;
+}
diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c
new file mode 100644 (file)
index 0000000..4d93645
--- /dev/null
@@ -0,0 +1,703 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <assert.h>
+
+#include <config.h>
+#ifdef HAVE_LIBREADLINE
+#define        READLINE_LIBRARY
+#include <readline/readline.h>
+#endif
+//extern char **completion_matches __P((char *, rl_compentry_func_t *));
+extern void using_history(void);
+extern void stifle_history(int);
+extern void add_history(char *);
+
+#include "parser.h"
+
+static command_t * top_level;      /* Top level of commands, initialized by
+                                    * Parser_init                           */
+static char * parser_prompt = NULL;/* Parser prompt, set by Parser_init     */
+static int done;                  /* Set to 1 by Parser_exit; ends the
+                                   * Parser_commands loop                  */
+
+
+/* static functions */
+static char *skipwhitespace(char *s);
+static char *skiptowhitespace(char *s);
+static command_t *find_cmd(char *name, command_t cmds[], char **next);
+static int process(char *s, char **next, command_t *lookup, command_t **result,
+                   char **prev);
+static void print_commands(char *str, command_t *table);
+
+/* Return a pointer to the first non-whitespace character of s, or to its
+ * terminating NUL when s holds nothing but whitespace. */
+static char * skipwhitespace(char * s)
+{
+        /* <ctype.h> functions take an unsigned char value; a plain char
+         * above 0x7f may be negative, which is undefined for isspace(). */
+        while (isspace((unsigned char)*s))
+                s++;
+        return s;
+}
+
+
+/* Return a pointer to the first whitespace character of s, or to its
+ * terminating NUL when s contains no whitespace. */
+static char * skiptowhitespace(char * s)
+{
+        /* cast: isspace() is undefined for negative plain-char values */
+        while (*s != '\0' && !isspace((unsigned char)*s))
+                s++;
+        return s;
+}
+
+/*
+ * Split line in place into blank/tab separated tokens and store pointers to
+ * them in argv[0..].  At most maxargs pointers are stored; further tokens
+ * are dropped.  line is modified by strtok().
+ *
+ * Returns the number of entries stored in argv.
+ */
+static int line2args(char *line, char **argv, int maxargs)
+{
+        char *arg;
+        int i = 0;
+
+        /* i < maxargs, not <=: argv has room for exactly maxargs entries */
+        for (arg = strtok(line, " \t");
+             arg != NULL && i < maxargs;
+             arg = strtok(NULL, " \t"))
+                argv[i++] = arg;
+
+        return i;
+}
+
+/* Look up name in cmds by exact, case-sensitive comparison with pc_name.
+ * Returns the matching entry, or NULL when the table (which ends at the
+ * first entry with a NULL pc_name) has none. */
+static command_t *Parser_findargcmd(char *name, command_t cmds[])
+{
+        command_t *entry = cmds;
+
+        while (entry->pc_name != NULL) {
+                if (!strcmp(name, entry->pc_name))
+                        return entry;
+                entry++;
+        }
+        return NULL;
+}
+
+/*
+ * Run one command given as an argument vector: argv[0] must match a command
+ * name in cmds exactly.  Returns the handler's result, or -1 after listing
+ * the valid command names when argv[0] matches none.
+ */
+int Parser_execarg(int argc, char **argv, command_t cmds[])
+{
+        command_t *cmd = Parser_findargcmd(argv[0], cmds);
+
+        if (cmd != NULL)
+                return (cmd->pc_func)(argc, argv);
+
+        printf("Try interactive use without arguments or use one of:\n");
+        cmd = cmds;
+        while (cmd->pc_name) {
+                printf("\"%s\" ", cmd->pc_name);
+                cmd++;
+        }
+        printf("\nas argument.\n");
+
+        return -1;
+}
+
+/* Return the first entry of cmds whose pc_name begins (ignoring case) with
+   the first token of name, or NULL if there is none, if the token is empty,
+   or if either argument is NULL.  On a match *next is set to the start of
+   the following token; on a miss with valid arguments it is left at the end
+   of the first token.  The string itself is not modified. */
+static command_t * find_cmd(char * name, command_t cmds[], char ** next)
+{
+        command_t *cand;
+        char *token;
+        int toklen;
+
+        if (name == NULL || cmds == NULL)
+                return NULL;
+
+        /* isolate the first token: [token, *next) */
+        token = skipwhitespace(name);
+        *next = skiptowhitespace(token);
+        toklen = *next - token;
+        if (toklen == 0)
+                return NULL;
+
+        for (cand = cmds; cand->pc_name; cand++) {
+                if (strncasecmp(token, cand->pc_name, toklen) != 0)
+                        continue;
+                *next = skipwhitespace(*next);
+                return cand;
+        }
+        return NULL;
+}
+
+/* Recursively process a command line string s and find the command
+   corresponding to it.  The result is one of:
+     CMD_NONE        no command matches the first token
+     CMD_AMBIG       the token is a prefix of more than one command
+     CMD_COMPLETE    a command with a handler (pc_func) was found
+     CMD_INCOMPLETE  a command without a handler was found and no
+                     sub-command follows it on the line
+   *result receives the matched entry (the first candidate for CMD_AMBIG)
+   and *prev the start of the text that was matched at the last level. */
+static int process(char *s, char ** next, command_t *lookup,
+                   command_t **result, char **prev)
+{
+        *result = find_cmd(s, lookup, next);
+        *prev = s;
+
+        /* non existent */
+        if (*result == NULL)
+                return CMD_NONE;
+
+        /* found entry: is it ambiguous, i.e. not the exact command name and
+           more than one command in the list matches?  find_cmd returned
+           the first candidate, so look for another one after it. */
+        if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) &&
+            find_cmd(s, (*result) + 1, next))
+                return CMD_AMBIG;
+
+        /* found a unique command: component or full? */
+        if ((*result)->pc_func)
+                return CMD_COMPLETE;
+
+        /* *next is a char *, so the end-of-line test has to look at the
+           character it points to; comparing the pointer itself with '\0'
+           is never true here.  Blanks are skipped because the failed
+           ambiguity probe above can leave *next on trailing whitespace. */
+        if (*skipwhitespace(*next) == '\0')
+                return CMD_INCOMPLETE;
+
+        return process(*next, next, (*result)->pc_sub_cmd, result, prev);
+}
+
+#ifdef HAVE_LIBREADLINE
+static command_t * match_tbl;   /* Command completion against this table */
+/* Completion generator: on each call returns a malloc'ed copy of the next
+ * name in match_tbl that starts (ignoring case) with text, or NULL when
+ * there are no more.  state == 0 restarts the scan from the top. */
+static char * command_generator(const char * text, int state)
+{
+        static int index, len;
+        char *name;
+
+        /* nothing to complete against */
+        if (match_tbl == NULL)
+                return NULL;
+
+        /* first call for this word: rewind and remember the prefix length */
+        if (state == 0) {
+                index = 0;
+                len = (int)strlen(text);
+        }
+
+        /* resume after the previous hit */
+        for (;;) {
+                name = match_tbl[index].pc_name;
+                if (name == NULL)
+                        break;
+                index++;
+
+                if (!strncasecmp(name, text, len))
+                        return strdup(name);
+        }
+
+        /* No more matches */
+        return NULL;
+}
+
+/* Completion hook installed by init_input() as
+ * rl_attempted_completion_function.  Walks the commands already typed in
+ * rl_line_buffer, descending into a command's sub-command table whenever the
+ * command is followed by a blank, then completes text against the table it
+ * ended up in. */
+static char **command_completion(char * text, int start, int end)
+{
+        command_t *hit;
+        char *cursor;
+
+        match_tbl = top_level;
+        hit = find_cmd(rl_line_buffer, match_tbl, &cursor);
+        while (hit != NULL) {
+                if (cursor[-1] == ' ')
+                        match_tbl = hit->pc_sub_cmd;
+                hit = find_cmd(cursor, match_tbl, &cursor);
+        }
+
+        return completion_matches(text, command_generator);
+}
+#endif
+
+/* Resolve line against the top-level command table and act on the outcome:
+ * run the handler of a complete command (printing its help text when it
+ * returns CMD_HELP), or explain on stderr why the line could not be run.
+ * Returns the handler's result, or 0 when no handler was run. */
+int execute_line(char * line)
+{
+        command_t *cmd, *alt;
+        char *argv[MAXARGS];
+        char *prev, *next, *scratch;
+        int status, n;
+        int rc = 0;
+
+        status = process(line, &next, top_level, &cmd, &prev);
+
+        if (status == CMD_AMBIG) {
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                /* list every candidate, resuming after each hit */
+                for (alt = find_cmd(prev, cmd, &scratch); alt != NULL;
+                     alt = find_cmd(prev, cmd, &scratch)) {
+                        fprintf(stderr, "%s ", alt->pc_name);
+                        cmd = alt + 1;
+                }
+                fprintf(stderr, "\n");
+        } else if (status == CMD_NONE) {
+                fprintf(stderr, "No such command, type help\n");
+        } else if (status == CMD_INCOMPLETE) {
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                n = 0;
+                while (cmd->pc_sub_cmd[n].pc_name) {
+                        fprintf(stderr, "%s ", cmd->pc_sub_cmd[n].pc_name);
+                        n++;
+                }
+                fprintf(stderr, "\n");
+        } else if (status == CMD_COMPLETE) {
+                n = line2args(line, argv, MAXARGS);
+                rc = (cmd->pc_func)(n, argv);
+
+                if (rc == CMD_HELP)
+                        fprintf(stderr, "%s\n", cmd->pc_help);
+        }
+
+        return rc;
+}
+
+/* Does nothing and returns 0; init_input() installs it in place of the
+ * readline terminal prep/deprep hooks when stdin is not a terminal. */
+int
+noop_fn ()
+{
+        return 0;
+}
+
+/* Prepare line input and report whether stdin is a terminal.  With
+   libreadline this sets up history and the completion hooks, and disables
+   the terminal prep/deprep hooks when input is not interactive.  Without it
+   (just in case you're ever in an airplane and discover you forgot to
+   install readline-dev :) only the isatty() check is done. */
+int init_input()
+{
+        int interactive;
+
+        interactive = isatty(fileno(stdin));
+
+#ifdef HAVE_LIBREADLINE
+        using_history();
+        stifle_history(HISTORY);
+
+        if (interactive == 0) {
+                rl_prep_term_function = (rl_vintfunc_t *)noop_fn;
+                rl_deprep_term_function = (rl_voidfunc_t *)noop_fn;
+        }
+
+        rl_attempted_completion_function = (CPPFunction *)command_completion;
+        rl_completion_entry_function = (void *)command_generator;
+#endif
+        return interactive;
+}
+
+#ifndef HAVE_LIBREADLINE
+#define add_history(s)
+/* Minimal stand-in for readline(): print prompt (if any), read one line of
+ * at most 2047 characters from stdin, drop its trailing newline and return
+ * a malloc'ed copy.  Returns NULL at end of input. */
+char * readline(char * prompt)
+{
+        char line[2048];
+        char *last;
+
+        if (prompt != NULL)
+                printf ("%s", prompt);
+
+        if (!fgets(line, sizeof(line), stdin))
+                return NULL;
+
+        if (line[0] != '\0') {
+                last = line + strlen(line) - 1;
+                if (*last == '\n')
+                        *last = '\0';
+        }
+        return strdup(line);
+}
+#endif
+
+/* Interactive command loop: read lines until end of input or until a
+ * command sets the done flag, executing every non-blank line.  The prompt is
+ * only shown when stdin is a terminal.  Returns the result of the last
+ * executed line (0 if none was executed). */
+int Parser_commands(void)
+{
+        int rc = 0;
+        int interactive = init_input();
+
+        while (!done) {
+                char *line = readline(interactive ? parser_prompt : NULL);
+                char *cmd;
+
+                if (line == NULL)
+                        break;
+
+                cmd = skipwhitespace(line);
+                if (*cmd != '\0') {
+                        add_history(cmd);
+                        rc = execute_line(cmd);
+                }
+
+                free(line);
+        }
+        return rc;
+}
+
+
+/* Reset the parser: clear the done flag, install cmds as the top-level
+ * command table and replace the prompt with a private copy of prompt. */
+void Parser_init(char * prompt, command_t * cmds)
+{
+        top_level = cmds;
+        done = 0;
+
+        /* free(NULL) is a no-op, so no guard is needed */
+        free(parser_prompt);
+        parser_prompt = strdup(prompt);
+}
+
+/* Release the prompt and raise the done flag so that Parser_commands()
+ * stops after the current line.  Both arguments are ignored. */
+void Parser_exit(int argc, char *argv[])
+{
+        free(parser_prompt);
+        parser_prompt = NULL;
+        done = 1;
+}
+
+/* Convert a string to an integer: a leading "0x" selects hexadecimal, any
+ * other leading '0' selects octal, everything else is decimal.  Returns the
+ * sscanf() result, i.e. 1 when *val was set. */
+int Parser_int(char *s, int *val)
+{
+        if (s[0] != '0')
+                return sscanf(s, "%d", val);
+
+        if (s[1] != 'x')
+                return sscanf(s, "%o", val);
+
+        /* skip the "0x" prefix */
+        return sscanf(s + 2, "%x", val);
+}
+
+
+/* Print the names of all commands reachable from the top-level table,
+ * followed by a hint on how to get per-command help.  Arguments are
+ * ignored. */
+void Parser_qhelp(int argc, char *argv[])
+{
+        fputs("Available commands are:\n", stdout);
+        print_commands(NULL, top_level);
+        fputs("For more help type: help command-name\n", stdout);
+}
+
+/*
+ * help [command [sub-command ...]]
+ *
+ * Without arguments prints the command overview.  Otherwise the arguments
+ * are joined with single blanks into a command line, which is resolved like
+ * a typed command; the help text of the command it names is printed, or a
+ * message explaining why it could not be resolved.  Always returns 0.
+ */
+int Parser_help(int argc, char **argv)
+{
+        char line[1024];
+        char *next, *prev, *tmp;
+        command_t *result, *ambig;
+        size_t used = 0;
+        int i;
+
+        if ( argc == 1 ) {
+                Parser_qhelp(argc, argv);
+                return 0;
+        }
+
+        /* Rebuild the command line.  The words need a blank between them
+           for find_cmd() to tell them apart again, and the total must stay
+           within line[]. */
+        line[0]='\0';
+        for ( i = 1 ;  i < argc ; i++ ) {
+                int n = snprintf(line + used, sizeof(line) - used, "%s%s",
+                                 i > 1 ? " " : "", argv[i]);
+
+                if (n < 0 || (size_t)n >= sizeof(line) - used) {
+                        fprintf(stderr, "%s: command name too long\n",
+                                argv[0]);
+                        return 0;
+                }
+                used += n;
+        }
+
+        switch ( process(line, &next, top_level, &result, &prev) ) {
+        case CMD_COMPLETE:
+                fprintf(stderr, "%s: %s\n",line, result->pc_help);
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "%s: Unknown command.\n", line);
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; result->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                /* list every candidate, resuming after each hit */
+                while( (ambig = find_cmd(prev, result, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        result = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        }
+        return 0;
+}
+
+
+/* Print the help for cmd, exactly as if "help <cmd>" had been typed. */
+void Parser_printhelp(char *cmd)
+{
+        char *help_argv[2];
+
+        help_argv[0] = "help";
+        help_argv[1] = cmd;
+        Parser_help(2, help_argv);
+}
+
+/*************************************************************************
+ * COMMANDS                                                             *
+ *************************************************************************/
+
+
+/* Print every runnable command in table, one per line, prefixed with str
+ * (the chain of parent command names) when str is not NULL, and recurse
+ * into sub-command tables.  A prefix that outgrows the 80-byte scratch
+ * buffer is truncated rather than overflowing it. */
+static void print_commands(char * str, command_t * table) {
+        command_t * cmds;
+        char        buf[80];
+
+        for (cmds = table; cmds->pc_name; cmds++) {
+                if (cmds->pc_func) {
+                        if (str)
+                                printf("\t%s %s\n", str, cmds->pc_name);
+                        else
+                                printf("\t%s\n", cmds->pc_name);
+                }
+                if (cmds->pc_sub_cmd) {
+                        if (str) {
+                                /* bounded: nesting depth and name lengths
+                                 * are not limited by the table format */
+                                snprintf(buf, sizeof(buf), "%s %s", str,
+                                         cmds->pc_name);
+                                print_commands(buf, cmds->pc_sub_cmd);
+                        } else {
+                                print_commands(cmds->pc_name,
+                                               cmds->pc_sub_cmd);
+                        }
+                }
+        }
+}
+
+/*
+ * Prompt with "<prompt> [<deft>]: " and read one line.  The answer, or deft
+ * when the line is empty or input has ended, is copied into res, which holds
+ * len bytes; the copy is truncated if necessary and always NUL-terminated
+ * when len > 0.
+ *
+ * Returns res, or NULL when input has ended (res then holds deft).
+ */
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+                    size_t len)
+{
+        const char *answer;
+        char *line;
+        size_t size = strlen(prompt) + strlen(deft) + 8;
+        char *theprompt = malloc(size);
+
+        assert(theprompt);
+        snprintf(theprompt, size, "%s [%s]: ", prompt, deft);
+
+        line = readline(theprompt);
+        free(theprompt);
+
+        answer = (line == NULL || *line == '\0') ? deft : line;
+        /* strncpy() would leave res unterminated for answers of len or
+         * more characters */
+        if (len > 0)
+                snprintf(res, len, "%s", answer);
+
+        if (line == NULL)
+                return NULL;
+
+        free(line);
+        return res;
+}
+
+/*
+ * Prompt for an integer until a valid one inside [min, max] is entered.
+ * An empty line selects deft.  End of input also selects deft: no further
+ * answer can arrive at that point, so asking again would never terminate.
+ * The text is converted by Parser_arg2int() using base.
+ *
+ * Returns the chosen value (converted from long to int).
+ */
+int Parser_getint(const char *prompt, long min, long max, long deft, int base)
+{
+        long result = deft;
+        char *line;
+        /* room for the decoration plus a 64-bit long in decimal and hex */
+        size_t size = strlen(prompt) + 64;
+        char *theprompt = malloc(size);
+
+        assert(theprompt);
+        snprintf(theprompt, size, "%s [%ld, (0x%lx)]: ", prompt, deft,
+                 (unsigned long)deft);
+
+        fflush(stdout);
+
+        for (;;) {
+                int rc;
+
+                line = readline(theprompt);
+                if (line == NULL) {
+                        /* end of input: fall back to the default */
+                        result = deft;
+                        break;
+                }
+                if (*line == '\0') {
+                        free(line);
+                        result = deft;
+                        break;
+                }
+
+                rc = Parser_arg2int(line, &result, base);
+                free(line);
+                if (rc != 0) {
+                        fprintf(stdout, "Invalid string.\n");
+                        fflush(stdout);
+                } else if (result > max || result < min) {
+                        fprintf(stdout,
+                                "Error: response must lie between %ld and %ld.\n",
+                                min, max);
+                        fflush(stdout);
+                } else {
+                        break;
+                }
+        }
+
+        free(theprompt);
+        return result;
+}
+
+/* Prompt for a yes/no answer until one starting with y, Y, n or N is given.
+ * An empty line or end of input selects deft, which must be 0 or 1.
+ * Returns 1 for yes and 0 for no. */
+int Parser_getbool(const char *prompt, int deft)
+{
+        int answer = 0;
+        char *reply;
+        char *theprompt = malloc(strlen(prompt) + 8);
+
+        assert(theprompt);
+
+        fflush(stdout);
+
+        if ( deft != 0 && deft != 1 ) {
+                fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n",
+                        deft);
+                assert ( 0 );
+        }
+        sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y");
+
+        for (;;) {
+                reply = readline(theprompt);
+
+                if (reply == NULL || reply[0] == '\0') {
+                        answer = deft;
+                        break;
+                }
+                if (reply[0] == 'y' || reply[0] == 'Y') {
+                        answer = 1;
+                        break;
+                }
+                if (reply[0] == 'n' || reply[0] == 'N') {
+                        answer = 0;
+                        break;
+                }
+
+                /* not an answer: discard it and ask again */
+                free(reply);
+                fprintf(stdout, "Invalid string. Must start with yY or nN\n");
+                fflush(stdout);
+        }
+
+        /* reply is the accepted line here, or NULL at end of input */
+        free(reply);
+        free(theprompt);
+        return answer;
+}
+
+/* Parse an integer out of "inp", or prompt for one.
+ *
+ * When "inp" is present and parses completely in "base" its value is
+ * returned as-is (it is not range checked).  Otherwise the user is
+ * prompted through Parser_getint() with "prompt", the default "deft" and
+ * the inclusive range [min, max]. */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                 int min, int max, int base)
+{
+    long result;
+
+    /* a missing argument must not reach strtol() */
+    if ( inp != NULL && Parser_arg2int(inp, &result, base) == 0 )
+       return result;
+
+    /* Parser_getint() takes (prompt, min, max, deft, base) */
+    return Parser_getint(prompt, min, max, deft, base);
+}
+
+/* Return the string argument "inp", or prompt for one when it is absent.
+ *
+ * A NULL or empty "inp" hands off to Parser_getstr(), which fills "answer"
+ * (at most "len" bytes) and supplies the return value.  A non-empty "inp"
+ * is returned unchanged and "answer" is left untouched. */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len)
+{
+    if ( inp != NULL && *inp != '\0' )
+       return inp;
+
+    return Parser_getstr(prompt, deft, answer, len);
+}
+
+/* Change a string into a number: return 0 on success, 1 on failure.
+ *
+ * "base" must be 0 or lie in 2..36 and is interpreted as by strtol(3).
+ * The whole of "inp" must be consumed by the conversion: a NULL or empty
+ * string, or any trailing character that is not part of the number, is
+ * rejected.  *result is only meaningful when 0 is returned. */
+int Parser_arg2int(const char *inp, long *result, int base)
+{
+    char *endptr;
+
+    /* strtol() must not be handed a NULL string */
+    if ( inp == NULL )
+       return 1;
+
+    if ( (base !=0) && (base < 2 || base > 36) )
+       return 1;
+
+    *result = strtol(inp, &endptr, base);
+
+    /* success only if there was input and all of it was converted */
+    if ( *inp != '\0' && *endptr == '\0' )
+       return 0;
+    else
+       return 1;
+}
+
+/* Convert a human readable size string to an int.
+ *
+ * "str" is a decimal integer optionally followed by one of k/K, m/M or g/G,
+ * which scale it by 2^10, 2^20 or 2^30: "1k" -> 1024.  Anything after the
+ * suffix letter is ignored.
+ *
+ * Returns 0 and stores the size in *sizep, or -1 (leaving *sizep alone)
+ * when "str" does not start with an integer or the scaled value does not
+ * fit in an int. */
+int Parser_size (int *sizep, char *str) {
+        /* largest int, derived without needing <limits.h> */
+        const long long int_max = (long long)(~0U >> 1);
+        long long scaled;
+        int size;
+        int shift;
+        char mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
+        default:
+                return (-1);
+
+        case 1:
+                /* plain number, no suffix */
+                *sizep = size;
+                return (0);
+
+        case 2:
+                break;
+        }
+
+        switch (*mod) {
+        case 'g':
+        case 'G':
+                shift = 30;
+                break;
+
+        case 'm':
+        case 'M':
+                shift = 20;
+                break;
+
+        case 'k':
+        case 'K':
+                shift = 10;
+                break;
+
+        default:
+                shift = 0;
+                break;
+        }
+
+        /* scale in a wider type so an oversized request is detected rather
+         * than overflowing the int */
+        scaled = (long long)size * (1LL << shift);
+        if (scaled > int_max || scaled < -int_max - 1)
+                return (-1);
+
+        *sizep = (int)scaled;
+        return (0);
+}
+
+/* Convert a string boolean to an int; "enable" -> 1.
+ *
+ * Matching ignores case.  "no", "n", "off" and "disable" store 0 in *b;
+ * "yes", "y", "on" and "enable" store 1.  Returns 0 on a match, or -1
+ * (leaving *b alone) for any other string. */
+int Parser_bool (int *b, char *str) {
+        static const char *const negative[] = { "no", "n", "off", "disable" };
+        static const char *const positive[] = { "yes", "y", "on", "enable" };
+        unsigned int i;
+
+        for (i = 0; i < sizeof (negative) / sizeof (negative[0]); i++) {
+                if (strcasecmp (str, negative[i]) == 0) {
+                        *b = 0;
+                        return (0);
+                }
+        }
+
+        for (i = 0; i < sizeof (positive) / sizeof (positive[0]); i++) {
+                if (strcasecmp (str, positive[i]) == 0) {
+                        *b = 1;
+                        return (0);
+                }
+        }
+
+        return (-1);
+}
+
+/* Command handler that sets the file's "done" flag and reports success.
+ * Both arguments are unused.
+ * NOTE(review): "done" is defined outside this excerpt; presumably the
+ * command loop polls it to stop -- confirm. */
+int Parser_quit(int argc, char **argv)
+{
+        (void)argc;
+        (void)argv;
+
+        done = 1;
+        return 0;
+}
diff --git a/lnet/utils/parser.h b/lnet/utils/parser.h
new file mode 100644 (file)
index 0000000..dead9f5
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY        100             /* Don't let history grow unbounded    */
+#define MAXARGS 100
+
+#define CMD_COMPLETE   0
+#define CMD_INCOMPLETE 1
+#define CMD_NONE       2
+#define CMD_AMBIG      3
+#define CMD_HELP       4
+
+typedef struct parser_cmd {
+       char    *pc_name;                       /* name of the command */
+       int     (* pc_func)(int, char **);      /* handler, called as (argc, argv) */
+       struct parser_cmd * pc_sub_cmd;         /* table of sub-commands (presumably NULL if none) */
+       char *pc_help;                          /* help text */
+} command_t;
+
+typedef struct argcmd {
+       char    *ac_name;                       /* name of the command */
+       int      (*ac_func)(int, char **);      /* handler, called as (argc, argv) */
+       char     *ac_help;                      /* help text */
+} argcmd_t;
+
+typedef struct network {
+       char    *type;
+       char    *server;
+       int     port;
+} network_t;
+
+int  Parser_quit(int argc, char **argv);
+void Parser_init(char *, command_t *); /* Set prompt and load command list */
+int Parser_commands(void);                     /* Start the command parser */
+void Parser_qhelp(int, char **);       /* Quick help routine */
+int Parser_help(int, char **);         /* Detailed help routine */
+void Parser_printhelp(char *);         /* Detailed help routine */
+void Parser_exit(int, char **);                /* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res, 
+                   size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+                 int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                  int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len);
+
+/* Extracts an integer from a string  with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert human readable size string to an int; "1k" -> 1024 */
+int Parser_size(int *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c
new file mode 100644 (file)
index 0000000..8235271
--- /dev/null
@@ -0,0 +1,1005 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <asm/byteorder.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+#include "parser.h"
+
+unsigned int portal_debug;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+
+static ptl_nid_t g_nid = 0;             /* peer NID recorded by the last 'connect' */
+static unsigned int g_nal = 0;          /* NAL picked by 'network'; 0 == not chosen yet */
+static unsigned short g_port = 0;       /* port given to the last tcp/toe 'connect' */
+
+static int g_socket_txmem = 0;          /* SO_SNDBUF for new sockets; 0 == leave unset */
+static int g_socket_rxmem = 0;          /* SO_RCVBUF for new sockets; 0 == leave unset */
+static int g_socket_nonagle = 1;        /* non-zero: set TCP_NODELAY on new sockets */
+
+/* One row of a name <-> number lookup table.  Tables are arrays of these
+ * ending with a row whose name is NULL. */
+typedef struct
+{
+        char *name;
+        int   num;
+} name2num_t;
+
+/* NAL names accepted by the 'network' command and the NAL numbers they
+ * select. */
+static name2num_t nalnames[] = {
+        {"tcp",                SOCKNAL},
+        {"toe",                TOENAL},
+        {"elan",       QSWNAL},
+        {"gm",         GMNAL},
+        {"scimac",      SCIMACNAL},
+        {NULL,         -1}
+};
+
+/* Return the row of "table" whose name is exactly "str" (case sensitive),
+ * or NULL when the NULL-named end row is reached without a match. */
+static name2num_t *
+name2num_lookup_name (name2num_t *table, char *str)
+{
+        name2num_t *row;
+
+        for (row = table; row->name != NULL; row++) {
+                if (strcmp (str, row->name) == 0)
+                        return (row);
+        }
+
+        return (NULL);
+}
+
+/* Return the first row of "table" whose number equals "num", or NULL when
+ * the NULL-named end row is reached without a match. */
+static name2num_t *
+name2num_lookup_num (name2num_t *table, int num)
+{
+        name2num_t *row;
+
+        for (row = table; row->name != NULL; row++) {
+                if (row->num == num)
+                        return (row);
+        }
+
+        return (NULL);
+}
+
+/* Map a NAL name from nalnames[] to its number; 0 means "unknown name". */
+int
+ptl_name2nal (char *str)
+{
+        name2num_t *match = name2num_lookup_name (nalnames, str);
+
+        if (match == NULL)
+                return (0);
+
+        return (match->num);
+}
+
+/* Map a NAL number to its name in nalnames[]; "???" for unknown numbers. */
+static char *
+nal2name (int nal)
+{
+        name2num_t *match = name2num_lookup_num (nalnames, nal);
+
+        if (match == NULL)
+                return ("???");
+
+        return (match->name);
+}
+
+/* Guess which NAL a NID belongs to from its low 32 bits: any of the upper
+ * 16 of those bits set is taken to be a socket NID, otherwise an elan one.
+ * This is a BIG pragmatic assumption, not a real lookup. */
+static int
+nid2nal (ptl_nid_t nid)
+{
+        __u32 low = (__u32)nid;
+
+        if ((low & 0xffff0000) == 0)
+                return (QSWNAL);
+
+        return (SOCKNAL);
+}
+
+/* Parse a NID from "str" into *nidp.  Forms are tried in this order:
+ *   1. dotted quad "a.b.c.d" with every part in 0..255;
+ *   2. a host name (must start with a letter) resolved by gethostbyname();
+ *   3. an integer as read by sscanf "%i" (decimal, 0x hex, 0 octal);
+ *   4. bare hex digits as read by sscanf "%x".
+ * Address-derived NIDs (1 and 2) are stored in HOST byte order.
+ *
+ * Returns 0 on success, -1 when no form matches. */
+int
+ptl_parse_nid (ptl_nid_t *nidp, char *str)
+{
+        struct hostent *he;
+        int             a;
+        int             b;
+        int             c;
+        int             d;
+        
+        if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 &&
+            (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+            (c & ~0xff) == 0 && (d & ~0xff) == 0)
+        {
+                /* widen before shifting: (a << 24) overflows a signed int
+                 * once a >= 128 */
+                __u32 addr = ((__u32)a << 24) | ((__u32)b << 16) |
+                             ((__u32)c << 8)  |  (__u32)d;
+
+                *nidp = (ptl_nid_t)addr;
+                return (0);
+        }
+        
+        if ((('a' <= str[0] && str[0] <= 'z') ||
+             ('A' <= str[0] && str[0] <= 'Z')) &&
+             (he = gethostbyname (str)) != NULL)
+        {
+                __u32 addr;
+
+                /* h_addr is a char *; copy it out instead of reading it
+                 * through a cast pointer */
+                memcpy (&addr, he->h_addr, sizeof (addr));
+
+                *nidp = (ptl_nid_t)ntohl(addr);  /* HOST byte order */
+                return (0);
+        }
+
+        if (sscanf (str, "%i", &a) == 1)
+        {
+                *nidp = (ptl_nid_t)a;
+                return (0);
+        }
+
+        if (sscanf (str, "%x", &a) == 1)
+        {
+                *nidp = (ptl_nid_t) a;
+                return (0);
+        }
+
+        return (-1);
+}
+
+/* Format "nid" into "buffer" and return "buffer".
+ *
+ * The form depends on nid2nal(nid): elan NIDs print as decimal, SCI ones
+ * as hex, socket NIDs as the host name found by gethostbyaddr() or, when
+ * that fails, as a dotted quad.  The caller must supply a buffer large
+ * enough for the result; no length is checked here. */
+char *
+ptl_nid2str (char *buffer, ptl_nid_t nid)
+{
+        int nal = nid2nal (nid);
+
+        if (nal == QSWNAL) {
+                sprintf (buffer, LPD64, nid);
+        } else if (nal == SCIMACNAL) {
+                sprintf (buffer, LPX64, nid);
+        } else if (nal == SOCKNAL) {
+                __u32           addr = htonl((__u32)nid); /* back to NETWORK byte order */
+                struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET);
+
+                if (he == NULL) {
+                        /* no reverse mapping: print the HOST order address */
+                        addr = (__u32)nid;
+                        sprintf (buffer, "%d.%d.%d.%d", 
+                                 (addr>>24)&0xff, (addr>>16)&0xff, (addr>>8)&0xff, addr&0xff);
+                } else {
+                        strcpy (buffer, he->h_name);
+                }
+        } else {
+                sprintf (buffer, "nid2nal broken");
+        }
+
+        return (buffer);
+}
+
+/* Write exactly "nob" bytes from "buffer" to "cfd".
+ *
+ * Short writes are continued from where they stopped and EINTR is retried.
+ * Returns 0 once everything is written, or the negative write() result
+ * (errno set) on any other error.  A write() that returns 0 is treated as
+ * fatal and aborts the program. */
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                /* step past the rc bytes just written, not past the
+                 * number of bytes still outstanding */
+                buffer = (char *)buffer + rc;
+                nob -= rc;
+        }
+        
+        return (0);
+}
+
+/* Read exactly "nob" bytes from "cfd" into "buffer".
+ *
+ * Short reads are continued from where they stopped and EINTR is retried.
+ * Returns 0 once everything has arrived, the negative read() result (errno
+ * set) on error, or -1 with errno == ECONNABORTED when the peer closes
+ * before "nob" bytes were received. */
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                /* step past the rc bytes just received, not past the
+                 * number of bytes still outstanding */
+                buffer = (char *)buffer + rc;
+                nob -= rc;
+        }
+        
+        return (0);
+}
+
+/* Register the portals control device (PORTALS_DEV_ID at PORTALS_DEV_PATH)
+ * with the ioctl layer.  Both arguments are unused; always returns 0. */
+int ptl_initialize(int argc, char **argv) 
+{
+        (void)argc;
+        (void)argv;
+
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+
+        return 0;
+}
+
+
+/* 'network' command: select the NAL that later commands operate on.
+ *
+ * Expects exactly one argument naming an entry of nalnames[].  On success
+ * the NAL number is stored in g_nal; otherwise a usage line listing the
+ * known names is printed to stderr.  Returns 0 in both cases. */
+int jt_ptl_network(int argc, char **argv)
+{
+        int         nal = 0;
+        name2num_t *entry;
+
+        if (argc == 2)
+                nal = ptl_name2nal (argv[1]);
+
+        if (nal != 0) {
+                g_nal = nal;
+                return (0);
+        }
+
+        /* wrong argument count or unknown name: show the choices */
+        fprintf(stderr, "usage: %s \n", argv[0]);
+        for (entry = nalnames; entry->name != NULL; entry++)
+                fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
+        fprintf(stderr, ">\n");
+
+        return (0);
+}
+
+/* Swap portals HELLO headers with the peer on connected socket "cfd".
+ *
+ * Sends a HELLO header carrying "my_nid" (with the protocol magic and
+ * version overlaid on the dest_nid field), then reads the peer's HELLO
+ * back in two steps: magic/version first, then the rest of the header.
+ *
+ * Returns 0 and stores the peer's NID in *peer_nid, or -1 on an I/O error,
+ * a bad magic, or a reply that is not a zero-payload HELLO. */
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        /* magic/version live in the bytes of hdr.dest_nid */
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        /* the reply overwrites the magic/version we just sent */
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        /* NOTE(review): received fields are converted with __cpu_to_le*
+         * here; __le*_to_cpu looks like the intended direction (as used
+         * for src_nid below) -- confirm */
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        /* NOTE(review): a version mismatch is only reported; execution
+         * carries on without returning an error -- confirm this is meant */
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+/* 'connect' command: record (and for tcp/toe, establish) the peer that
+ * later commands talk to.  'network' must have been run first.
+ *
+ * tcp/toe:   argv = <hostname> <port> [flags]; flag 'i' sets the bind-irq
+ *            ioctl flag, 'x' exchanges NIDs with the peer via HELLO
+ *            headers instead of using the peer's IP address as its NID.
+ *            A TCP connection is made, its fd is registered with the NAL
+ *            through NAL_CMD_REGISTER_PEER_FD, and this process's copy of
+ *            the fd is then closed.
+ * elan/gm:   argv[1] is the peer NID in decimal.
+ * scimac:    argv[1] is the peer NID in hex.
+ *
+ * Returns 0 on success or after printing usage, -1 on failure.  The
+ * socket is closed on every failure path once it has been created. */
+int jt_ptl_connect(int argc, char **argv)
+{
+        if (argc < 2) {
+        usage:
+                fprintf(stderr, "usage: %s <hostname port [xi]> or <elan ID>\n",
+                        argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                ptl_nid_t peer_nid;
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                struct sockaddr_in srvaddr;
+                char *flag;
+                int fd, rc;
+                int nonagle = 0;
+                int rxmem = 0;
+                int txmem = 0;
+                int bind_irq = 0;
+                int xchange_nids = 0;
+                int o;
+                socklen_t olen;         /* getsockopt() wants a socklen_t * */
+                
+                if (argc < 3) {
+                        goto usage;
+                }
+
+                /* NOTE(review): gethostbyname() reports failures through
+                 * h_errno, so strerror(errno) may not describe the error */
+                he = gethostbyname(argv[1]);
+                if (!he) {
+                        fprintf(stderr, "gethostbyname error: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                g_port = atol(argv[2]);
+
+                if (argc > 3)
+                        for (flag = argv[3]; *flag != 0; flag++)
+                                switch (*flag)
+                                {
+                                case 'i':
+                                        bind_irq = 1;
+                                        break;
+                                        
+                                case 'x':
+                                        xchange_nids = 1;
+                                        break;
+
+                                default:
+                                        fprintf (stderr, "unrecognised flag '%c'\n",
+                                                 *flag);
+                                        return (-1);
+                                }
+                
+                memset(&srvaddr, 0, sizeof(srvaddr));
+                srvaddr.sin_family = AF_INET;
+                srvaddr.sin_port = htons(g_port);
+                srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr;
+        
+                fd = socket(PF_INET, SOCK_STREAM, 0);
+                if ( fd < 0 ) {
+                        fprintf(stderr, "socket() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                /* apply the configured socket tunables before connecting */
+                if (g_socket_nonagle)
+                {
+                        o = 1;
+                        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_rxmem != 0)
+                {
+                        o = g_socket_rxmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_txmem != 0)
+                {
+                        o = g_socket_txmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                if ( rc == -1 ) { 
+                        fprintf(stderr, "connect() failed: %s\n",
+                                strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                /* read back the settings actually in force, for the report
+                 * below; failures here are only warned about */
+                olen = sizeof (txmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
+                        fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
+                olen = sizeof (rxmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
+                        fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
+                olen = sizeof (nonagle);
+                if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
+                        fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
+
+                if (xchange_nids) {
+                        
+                        /* ask the NAL for our own NID, then trade it for
+                         * the peer's */
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = g_nal;
+                        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+                        if (rc != 0)
+                        {
+                                fprintf (stderr, "failed to get my nid: %s\n",
+                                         strerror (errno));
+                                close (fd);
+                                return (-1);
+                        }
+                        
+                        rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (fd);
+                                return (-1);
+                        }
+                }
+                else
+                        peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */
+
+                printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
+                       peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = fd;
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to register fd with portals: "
+                                "%s\n", strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                g_nid = peer_nid;
+                printf("Connection to "LPX64" registered with socknal\n", g_nid);
+
+                /* our descriptor is no longer needed once registered */
+                rc = close(fd);
+                if (rc) {
+                        fprintf(stderr, "close failed: %d\n", rc);
+                }
+        } else if (g_nal == QSWNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == GMNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == SCIMACNAL) {
+                unsigned int    tmpnid;
+                if(sscanf(argv[1], "%x", &tmpnid) == 1) {
+                        g_nid=tmpnid;
+                }
+                else {
+                        fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]);
+                }
+
+
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+        }
+
+        return 0;
+}
+
+/* 'disconnect' command: close tcp/toe connections.
+ *
+ * With a hostname argument the NID to close is that host's IP address in
+ * HOST byte order; with no argument ioc_nid is left as PORTAL_IOC_INIT set
+ * it, which asks for every connection to be closed.  For elan, GM and SCI
+ * a notice is printed and nothing else happens.
+ *
+ * Returns 0 on success or after printing usage, -1 on failure. */
+int jt_ptl_disconnect(int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        /* NOTE(review): gethostbyname() reports failures
+                         * through h_errno, not errno */
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Disconnecting ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to remove connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+/* 'push' command: issue NAL_CMD_PUSH_CONNECTION for tcp/toe connections.
+ *
+ * With a hostname argument the NID to push is that host's IP address in
+ * HOST byte order; with no argument ioc_nid is left as PORTAL_IOC_INIT set
+ * it, which asks for every connection to be pushed.  For elan, GM and SCI
+ * a notice is printed and nothing else happens.
+ *
+ * Returns 0 on success or after printing usage, -1 on failure. */
+int jt_ptl_push_connection (int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        /* NOTE(review): gethostbyname() reports failures
+                         * through h_errno, not errno */
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Pushing ALL connections.\n");
+                        /* leave ioc_nid zeroed == push all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to push connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'push' doesn't make any sense for elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'push' doesn't make any sense for GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'push' doesn't make any sense for SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+/* 'ping' command: argv = nid [count] [size] [timeout (secs)].
+ *
+ * Parses the NID with ptl_parse_nid() and passes it, together with count
+ * (default 1, must be 0..20000), size (default 4) and timeout (default 1),
+ * to the IOC_PORTAL_PING ioctl for the NAL chosen by 'network'.  The
+ * numeric arguments are read with atol(); size and timeout are not range
+ * checked here.
+ *
+ * Returns 0 on success or after printing usage, -1 on failure. */
+int jt_ptl_ping(int argc, char **argv)
+{
+        int       rc;
+        ptl_nid_t nid;
+        long      count   = 1;
+        long      size    = 4;
+        long      timeout = 1;
+        struct portal_ioctl_data data;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]);
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+        
+        if (argc > 2)
+        {
+                count = atol(argv[2]);
+
+                if (count < 0 || count > 20000) 
+                {
+                        fprintf(stderr, "are you insane?  %ld is a crazy count.\n", count);
+                        return -1;
+                }
+        }
+        
+        if (argc > 3)
+                size= atol(argv[3]);
+
+        if (argc > 4)
+                timeout = atol (argv[4]);
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_count   = count;
+        data.ioc_size    = size;
+        data.ioc_nid     = nid;
+        data.ioc_nal     = g_nal;
+        data.ioc_timeout = timeout;
+        
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data);
+        if (rc) {
+                fprintf(stderr, "failed to start pinger: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+/* 'mynid' command: register this node's NID with a tcp/toe NAL.
+ *
+ * The NID is the IP address (HOST byte order) of the optional hostname
+ * argument, or of gethostname() when none is given, and is handed to the
+ * NAL with NAL_CMD_REGISTER_MYNID.  elan, GM and SCI are rejected.
+ *
+ * Returns -1 for a bad NAL or a failed name lookup, otherwise 0 -- note
+ * that a failing ioctl is reported on stderr but still returns 0. */
+int jt_ptl_mynid(int argc, char **argv)
+{
+        int rc;
+        struct hostent *h;
+        char buf[1024], *hostname;
+        struct portal_ioctl_data data;
+        ptl_nid_t mynid;
+        
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                fprintf(stderr, "hostname defaults to the hostname of the "
+                        "machine.\n");
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (g_nal == QSWNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for elan.\n");
+                return -1;
+        } else  if (g_nal == GMNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for GM.\n");
+                return -1;
+        } else  if (g_nal == SCIMACNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for SCI.\n");
+                return -1;
+        } 
+        
+        if (g_nal != SOCKNAL && g_nal != TOENAL) {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        if (argc == 1) {
+                /* no hostname given: use this machine's own name */
+                if (gethostname(buf, sizeof(buf)) != 0) {
+                        fprintf(stderr, "gethostname failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+                hostname = buf;
+        } else {
+                hostname = argv[1];
+        }
+
+        h = gethostbyname(hostname);
+
+        if (!h) {
+                fprintf(stderr, "cannot get address for host '%s': %d\n",
+                        hostname, h_errno);
+                return -1;
+        }
+        mynid = (ptl_nid_t)ntohl (*(__u32 *)h->h_addr);      /* HOST byte order */
+        
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = mynid;
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc < 0)
+                fprintf(stderr, "IOC_PORTAL_REGISTER_MYNID failed: %s\n",
+                       strerror(errno));
+        else
+                printf("registered my nid "LPX64" (%s)\n", mynid, hostname);
+        return 0;
+}
+
+/*
+ * jt_ptl_fail_nid - issue IOC_PORTAL_FAIL_NID for a NID on the current NAL.
+ *
+ * usage: fail nid|"_all_" [count (0 == mend)]
+ *
+ * "_all_" selects PTL_NID_ANY.  When no count is given the threshold is
+ * PTL_MD_THRESH_INF.  The count is parsed with "%i", so decimal, octal
+ * and hexadecimal notations are accepted.
+ *
+ * Returns 0 on success or after printing the usage message, and -1 on a
+ * parse error, when no NAL has been selected, or when the ioctl fails.
+ */
+int
+jt_ptl_fail_nid (int argc, char **argv)
+{
+        int                      rc;
+        int                      count;
+        ptl_nid_t                nid;
+        unsigned int             threshold;
+        struct portal_ioctl_data data;
+
+        if (argc < 2 || argc > 3)
+        {
+                fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]);
+                return (0);
+        }
+        
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (!strcmp (argv[1], "_all_"))
+                nid = PTL_NID_ANY;
+        else if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        if (argc < 3) {
+                threshold = PTL_MD_THRESH_INF;
+        } else if (sscanf (argv[2], "%i", &count) != 1) {
+                fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]);
+                return (-1);
+        } else {
+                /* "%i" requires an int *; convert only after a good parse */
+                threshold = count;
+        }
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nid = nid;
+        data.ioc_count = threshold;
+        
+        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data);
+        if (rc < 0) {
+                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
+                         strerror (errno));
+                /* report the failure to the caller instead of claiming success */
+                return (-1);
+        }
+
+        printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]);
+        return (0);
+}
+
+/*
+ * jt_ptl_rxmem - show, and optionally set, the socket receive buffer size.
+ *
+ * With an argument, the size is parsed by Parser_size() and stored in
+ * g_socket_rxmem when it is valid and non-negative.  The current value is
+ * then printed.  Always returns 0, including after a parse error.
+ */
+int
+jt_ptl_rxmem (int argc, char **argv)
+{
+        if (argc > 1)
+        {
+                int bytes;
+
+                if (Parser_size (&bytes, argv[1]) == 0 && bytes >= 0)
+                {
+                        g_socket_rxmem = bytes;
+                }
+                else
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+        }
+
+        printf ("Socket rmem = %d\n", g_socket_rxmem);
+        return (0);
+}
+
+/*
+ * jt_ptl_txmem - show, and optionally set, the socket send buffer size.
+ *
+ * With an argument, the size is parsed by Parser_size() and stored in
+ * g_socket_txmem when it is valid and non-negative.  The current value is
+ * then printed.  Always returns 0, including after a parse error.
+ */
+int
+jt_ptl_txmem (int argc, char **argv)
+{
+        if (argc > 1)
+        {
+                int bytes;
+
+                if (Parser_size (&bytes, argv[1]) == 0 && bytes >= 0)
+                {
+                        g_socket_txmem = bytes;
+                }
+                else
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+        }
+
+        printf ("Socket txmem = %d\n", g_socket_txmem);
+        return (0);
+}
+
+/*
+ * jt_ptl_nagle - show, and optionally set, whether Nagle is enabled.
+ *
+ * With an argument, the boolean is parsed by Parser_bool() and its
+ * negation is stored in g_socket_nonagle.  The resulting state is then
+ * printed.  Always returns 0, including after a parse error.
+ */
+int
+jt_ptl_nagle (int argc, char **argv)
+{
+        if (argc > 1)
+        {
+                int want_nagle;
+
+                if (Parser_bool (&want_nagle, argv[1]) == 0)
+                {
+                        /* the global records the inverse: "no nagle" */
+                        g_socket_nonagle = !want_nagle;
+                }
+                else
+                {
+                        fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
+                        return (0);
+                }
+        }
+
+        if (g_socket_nonagle)
+                printf ("Nagle %s\n", "disabled");
+        else
+                printf ("Nagle %s\n", "enabled");
+        return (0);
+}
+
+/*
+ * jt_ptl_add_route - add an entry to the routing table.
+ *
+ * usage: add_route gateway target [target]
+ *
+ * The gateway's NAL is derived from its NID with nid2nal().  With one
+ * target the route covers that single NID; with two, the ioctl is given
+ * the smaller NID in ioc_nid2 and the larger in ioc_nid3.
+ *
+ * Returns 0 on success or after printing the usage message, and -1 on a
+ * parse error or when the ioctl fails.
+ */
+int
+jt_ptl_add_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        ptl_nid_t                gateway_nid;
+        int                      gateway_nal;
+        int                      rc;
+        
+        if (argc < 3)
+        {
+                fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        gateway_nal = nid2nal (gateway_nid);
+
+        if (ptl_parse_nid (&nid1, argv[2]) != 0)
+        {
+                fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]);
+                return (-1);
+        }
+
+        if (argc < 4)
+                nid2 = nid1;
+        else if (ptl_parse_nid (&nid2, argv[3]) != 0)
+        {
+                /* report the argument that was actually parsed: argv[3] */
+                fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = gateway_nid;
+        data.ioc_nal = gateway_nal;
+        /* order the two targets so the range is always low..high */
+        data.ioc_nid2 = MIN (nid1, nid2);
+        data.ioc_nid3 = MAX (nid1, nid2);
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data);
+        if (rc != 0) 
+        {
+                fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno));
+                return (-1);
+        }
+        
+        return (0);
+}
+
+/*
+ * jt_ptl_del_route - delete an entry from the routing table.
+ *
+ * usage: del_route targetNID
+ *
+ * Returns 0 on success or after printing the usage message, and -1 when
+ * the NID cannot be parsed or the ioctl fails.
+ */
+int
+jt_ptl_del_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                target;
+
+        if (argc < 2)
+        {
+                fprintf (stderr, "usage: %s targetNID\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&target, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = target;
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data) == 0)
+                return (0);
+
+        fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", target, strerror (errno));
+        return (-1);
+}
+
+/*
+ * jt_ptl_print_routes - print the routing table, one route per line.
+ *
+ * Entries are fetched by increasing index with IOC_PORTAL_GET_ROUTE until
+ * the ioctl returns non-zero, which ends the listing.  Always returns 0.
+ */
+int
+jt_ptl_print_routes (int argc, char **argv)
+{
+        char                      nidstr[3][128];
+        struct portal_ioctl_data  data;
+        int                       slot = 0;
+        int                       gw_nal;
+        ptl_nid_t                 gw_nid;
+        ptl_nid_t                 lo_nid;
+        ptl_nid_t                 hi_nid;
+
+        while (1)
+        {
+                PORTAL_IOC_INIT(data);
+                data.ioc_count = slot;
+
+                /* a failing lookup marks the end of the table */
+                if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data) != 0)
+                        break;
+
+                gw_nal = data.ioc_nal;
+                gw_nid = data.ioc_nid;
+                lo_nid = data.ioc_nid2;
+                hi_nid = data.ioc_nid3;
+
+                printf ("%8s %18s : %s - %s\n", 
+                        nal2name (gw_nal), 
+                        ptl_nid2str (nidstr[0], gw_nid),
+                        ptl_nid2str (nidstr[1], lo_nid),
+                        ptl_nid2str (nidstr[2], hi_nid));
+
+                slot++;
+        }
+
+        return (0);
+}
+
diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c
new file mode 100644 (file)
index 0000000..d38bd4a
--- /dev/null
@@ -0,0 +1,64 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+#include "parser.h"
+
+
+/*
+ * Command table handed to the parser in main().  Each entry gives the
+ * command name, the function that implements it, a third field that is 0
+ * for every command here (NOTE(review): meaning defined by command_t in
+ * parser.h -- confirm), and the help text.  An all-zero entry ends the
+ * table.
+ */
+command_t list[] = {
+        {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
+        {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: <hostname port> | <id> for tcp/elan respectively)"},
+        {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname])"},
+        {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname])"},
+        {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
+        {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
+        {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+        {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID)"},
+        {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
+        {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
+        {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
+        {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
+/*
+ * ptlctl entry point.  After ptl_initialize() succeeds, any command-line
+ * arguments are run as a single command through Parser_execarg(), whose
+ * result becomes the exit status; with no arguments the interactive
+ * "ptlctl > " prompt is started instead.
+ */
+int main(int argc, char **argv)
+{
+        int rc = ptl_initialize(argc, argv);
+
+        if (rc < 0)
+                exit(1);
+
+        Parser_init("ptlctl > ", list);
+
+        if (argc <= 1) {
+                /* no command given: run the interactive loop */
+                Parser_commands();
+                return 0;
+        }
+
+        return Parser_execarg(argc - 1, argv + 1, list);
+}
diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c
new file mode 100644 (file)
index 0000000..37da12c
--- /dev/null
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+/* Return the current gettimeofday() time as seconds, with the
+ * microseconds as the fractional part. */
+double
+timenow ()
+{
+   struct timeval stamp;
+   double         fraction;
+
+   gettimeofday (&stamp, NULL);
+   fraction = stamp.tv_usec / 1000000.0;
+   return (stamp.tv_sec + fraction);
+}
+
+/*
+ * Read the statistics text from the start of `fd', print a one-line
+ * summary on stdout, then write a single newline back to the start of
+ * `fd' (presumably this resets the kernel counters -- TODO confirm).
+ *
+ * The text must hold at least three numbers (bytes, packets, errors); an
+ * optional fourth (depth) is printed in parentheses when present.  On the
+ * first call only totals are printed; later calls also print rates over
+ * the time since the previous call finished.
+ *
+ * Exits the process with status 1 if the file cannot be read or parsed.
+ */
+void
+do_stat (int fd)
+{
+   static char  buffer[1024];
+   static double last = 0.0;
+   double now;
+   double t;
+   long long bytes;
+   long      packets;
+   long      errors;
+   long      depth;
+   int    n;
+   
+   lseek (fd, 0, SEEK_SET);
+   now = timenow();
+   /* read at most sizeof - 1 bytes so the NUL below stays inside buffer */
+   n = read (fd, buffer, sizeof (buffer) - 1);
+   if (n < 0)
+   {
+      fprintf (stderr, "Can't read statfile\n");
+      exit (1);
+   }    
+   buffer[n] = 0;
+   
+   /* %lld is the standard conversion for long long (was the non-standard %Ld) */
+   n = sscanf (buffer, "%lld %ld %ld %ld", &bytes, &packets, &errors, &depth);
+   
+   if (n < 3)
+   {
+      fprintf (stderr, "Can't parse statfile\n");
+      exit (1);
+   }
+   
+   if (last == 0.0)
+      /* first call: there is no earlier sample to compute rates from */
+      printf ("%lld bytes, %ld packets (sz %lld) %ld errors", 
+             bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors);
+   else
+   {
+      t = now - last;
+
+      printf ("%9lld (%7.2fMb/s), %7ld packets (sz %5lld, %5ld/s) %ld errors (%ld/s)", 
+             bytes, ((double)bytes)/((1<<20) * t),
+             packets, (long long)((packets == 0) ? 0LL : bytes/packets), (long)(packets/t),
+             errors, (long)(errors/t));
+   }
+
+   /* depth is only valid when the fourth field was parsed */
+   if (n == 4)
+      printf (" (%ld)\n", depth);
+   else
+      printf ("\n");
+
+   fflush (stdout);
+   
+   lseek (fd, 0, SEEK_SET);
+   /* keep going on failure, but do not hide it */
+   if (write (fd, "\n", 1) != 1)
+      fprintf (stderr, "Can't write to statfile: %s\n", strerror (errno));
+   last = timenow();
+}
+
+/*
+ * routerstat [interval]
+ *
+ * Print the statistics read from /proc/sys/portals/router once.  If a
+ * non-zero interval (in seconds) is given, keep printing them every
+ * `interval' seconds and never return.  Returns 1 if the file cannot be
+ * opened.
+ */
+int main (int argc, char **argv)
+{
+   int  period = (argc > 1) ? atoi (argv[1]) : 0;
+   int  statfd = open ("/proc/sys/portals/router", O_RDWR);
+
+   if (statfd < 0)
+   {
+      fprintf (stderr, "Can't open stat: %s\n", strerror (errno));
+      return (1);
+   }
+
+   do_stat (statfd);
+
+   /* a zero interval means "print once"; otherwise this never ends */
+   while (period != 0)
+   {
+      sleep (period);
+      do_stat (statfd);
+   }
+
+   return (0);
+}
diff --git a/lustre/Makefile.mk b/lustre/Makefile.mk
new file mode 100644 (file)
index 0000000..e540148
--- /dev/null
@@ -0,0 +1,4 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += mds/
diff --git a/lustre/mds/Makefile.mk b/lustre/mds/Makefile.mk
new file mode 100644 (file)
index 0000000..6b712fb
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+obj-y += mds.o
+
+# Objects linked into mds.o.  Only .o files belong here; the header
+# mds_internal.h is not a link input and has been dropped from the list.
+mds-objs    := mds_lov.o handler.o mds_reint.o mds_fs.o lproc_mds.o mds_updates.o mds_open.o simple.o target.o
diff --git a/lustre/portals/AUTHORS b/lustre/portals/AUTHORS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/ChangeLog b/lustre/portals/ChangeLog
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/Kernelenv.in b/lustre/portals/Kernelenv.in
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lustre/portals/Kernelenv.mk b/lustre/portals/Kernelenv.mk
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lustre/portals/Makefile.am b/lustre/portals/Makefile.am
new file mode 100644 (file)
index 0000000..3c42103
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = Rules.linux archdep.m4 MCP
+DIST_SUBDIRS = libcfs portals knals unals utils tests doc router
+SUBDIRS = libcfs portals knals unals utils tests doc router
diff --git a/lustre/portals/Makefile.mk b/lustre/portals/Makefile.mk
new file mode 100644 (file)
index 0000000..be0e51a
--- /dev/null
@@ -0,0 +1,6 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += libcfs/
+obj-y += knals/
+obj-y += router/
diff --git a/lustre/portals/NEWS b/lustre/portals/NEWS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/README b/lustre/portals/README
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/Rules.linux.in b/lustre/portals/Rules.linux.in
new file mode 100644 (file)
index 0000000..8247deb
--- /dev/null
@@ -0,0 +1,37 @@
+# included in Linux kernel directories
+# Rules for module building
+
+MODLINK=@MOD_LINK@
+if LINUX25
+
+
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g')
+AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2  -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename)
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+
+
+else
+
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+
+
+endif
+
+
+tags:
+       rm -f $(top_srcdir)/TAGS
+       rm -f $(top_srcdir)/tags
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a
+
+
+
+
diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4
new file mode 100644 (file)
index 0000000..0315644
--- /dev/null
@@ -0,0 +1,206 @@
+
+# -------- in kernel compilation? (2.5 only) -------------
+# --enable-inkernel selects the 2.5 in-kernel makefiles.  The choice is in
+# the shell variable $enable_inkernel; AM_CONDITIONAL does not create a
+# shell variable named INKERNEL, so report $enable_inkernel instead.
+AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles])
+AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
+echo "Makefile for in kernel build: ${enable_inkernel:-no}"
+
+# -------- liblustre compilation --------------
+AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
+
+# -------- set linuxdir ------------
+
+AC_ARG_WITH(linux, [  --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux)
+AC_SUBST(LINUX)
+
+# --------- UML?  --------------------
+AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
+if test $host_cpu = "lib" ; then 
+        host_cpu="lib"
+       AC_MSG_RESULT(no building Lustre library)
+else
+  if test -e $LINUX/include/asm-um ; then
+    if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
+       host_cpu="um";
+       AC_MSG_RESULT(yes)
+    else
+       AC_MSG_RESULT(no (asm doesn't point at asm-um))
+    fi
+
+  else 
+        AC_MSG_RESULT(no (asm-um missing))
+  fi
+fi
+
+# --------- Linux 25 ------------------
+# A tree that contains include/linux/namei.h is treated as Linux 2.5.  The
+# result is kept in the shell variable $linux25 and drives the LINUX25
+# automake conditional.
+
+AC_MSG_CHECKING(if you are running linux 2.5)
+if test -e $LINUX/include/linux/namei.h ; then
+        linux25="yes"
+        AC_MSG_RESULT(yes)
+else
+        linux25="no"
+        AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
+# report the value detected above; no shell variable LINUX25 is set here
+echo "Makefiles for in linux 2.5 build: $linux25"
+
+# -------  Makeflags ------------------
+
+AC_MSG_CHECKING(setting make flags system architecture: )
+case ${host_cpu} in
+       lib )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall '
+       KCPPFLAGS='-D__arch_lib__ '
+        MOD_LINK=elf_i386
+;;
+       um )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include '
+        ;;
+        esac
+
+        MOD_LINK=elf_i386
+;;
+       i*86 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        ;;
+        esac
+        MOD_LINK=elf_i386
+;;
+
+       alphaev6 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alphaev67 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alpha* )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       ia64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step'
+       KCPPFLAGS='-D__KERNEL__ -DMODULE'
+        MOD_LINK=elf64_ia64
+;;
+
+       sparc64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf64_sparc
+
+;;
+
+       powerpc )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf32ppclinux
+;;
+
+        *)
+       AC_ERROR("Unknown Linux Platform: $host_cpu")
+;;
+esac
+
+# ----------- make dep run? ------------------
+
+if test $host_cpu != "lib" ; then 
+  AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
+  if test -f $LINUX/include/linux/config.h ; then
+  AC_MSG_RESULT(yes)
+ else
+  AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
+  fi
+fi
+
+# ------------ include paths ------------------
+
+if test $host_cpu != "lib" ; then 
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include -I$(LINUX)/include'
+else
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include'
+fi
+CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
+
+if test $host_cpu != "lib" ; then 
+# ------------ autoconf.h ------------------
+  AC_MSG_CHECKING(if autoconf.h is in kernel source)
+  if test -f $LINUX/include/linux/autoconf.h ; then
+      AC_MSG_RESULT(yes)
+  else
+      AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
+  fi
+
+# ------------ RELEASE and moduledir ------------------
+  AC_MSG_CHECKING(for Linux release)
+  
+  dnl We need to rid ourselves of the nasty [ ] quotes.
+  changequote(, )
+  dnl Get release from version.h
+  RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`"
+  changequote([, ])
+  
+  moduledir='$(libdir)/modules/'$RELEASE/kernel
+  AC_SUBST(moduledir)
+  
+  modulefsdir='$(moduledir)/fs/$(PACKAGE)'
+  AC_SUBST(modulefsdir)
+  
+  AC_MSG_RESULT($RELEASE)
+  AC_SUBST(RELEASE)
+
+# ---------- modversions? --------------------
+  AC_MSG_CHECKING(for MODVERSIONS)
+  if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
+  then
+        MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB"
+        AC_MSG_RESULT(yes)
+  else
+        MFLAGS=
+        AC_MSG_RESULT(no)
+  fi
+fi
+
+# ---------- SMP -------------------
+#AC_MSG_CHECKING(for SMP)
+#if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then
+#        SMPFLAG=
+#        AC_MSG_RESULT(yes)
+#else
+#        SMPFLAG=
+#        AC_MSG_RESULT(no)
+#fi
+
+CFLAGS="$KCFLAGS"
+CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS "
+
+AC_SUBST(MOD_LINK)
+AC_SUBST(LINUX25)
\ No newline at end of file
diff --git a/lustre/portals/autogen.sh b/lustre/portals/autogen.sh
new file mode 100755 (executable)
index 0000000..9deed73
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+aclocal &&
+automake --add-missing &&
+${AUTOCONF:-autoconf}
diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4
new file mode 100644 (file)
index 0000000..4e8dbbb
--- /dev/null
@@ -0,0 +1,108 @@
+
+# ----------  directories ---------
+
+
+# ---------  unsigned long long sane? -------
+
+AC_CHECK_SIZEOF(unsigned long long, 0)
+echo "---> size SIZEOF $SIZEOF_unsigned_long_long"
+echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long"
+if test $ac_cv_sizeof_unsigned_long_long != 8 ; then
+        AC_MSG_ERROR([** we assume that sizeof(long long) == 8.  Tell phil@clusterfs.com])
+fi
+
+# directories for binaries
+ac_default_prefix=
+bindir='${exec_prefix}/usr/bin'
+sbindir='${exec_prefix}/usr/sbin'
+includedir='${prefix}/usr/include'
+
+# Directories for documentation and demos.
+docdir='${prefix}/usr/share/doc/$(PACKAGE)'
+AC_SUBST(docdir)
+demodir='$(docdir)/demo'
+AC_SUBST(demodir)
+pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples'
+AC_SUBST(pkgexampledir)
+pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre'
+AC_SUBST(pymoddir)
+modulenetdir='$(moduledir)/net/$(PACKAGE)'
+AC_SUBST(modulenetdir)
+
+
+# ----------  BAD gcc? ------------
+AC_PROG_RANLIB
+AC_PROG_CC
+AC_MSG_CHECKING(for buggy compiler)
+CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"`
+# bad_cc: print the compiler version string held in $CC_VERSION together
+# with a warning that it is known to generate bad code, then stop
+# configure through AC_MSG_ERROR.
+bad_cc() {
+       echo
+       echo "   '$CC_VERSION'"
+       echo "  has been known to generate bad code, "
+       echo "  please get an updated compiler."
+       AC_MSG_ERROR(sorry)
+}
+TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
+if test "$TMP_VERSION" = "gcc version 2.95"; then
+        bad_cc
+fi
+case "$CC_VERSION" in 
+       # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
+       # without "sub    $0xc,%esp" to protect the stack from being
+       # stomped on by interrupts (bug 606)
+       "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)")
+               bad_cc
+               ;;
+       # mandrake's similar sub 0xc compiler bug
+       # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2
+       "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
+               bad_cc
+               ;;
+       *)
+               AC_MSG_RESULT(no known problems)
+               ;;
+esac
+# end ------  BAD gcc? ------------
+
+# --------  Check for required packages  --------------
+
+# this doesn't seem to work on older autoconf
+# AC_CHECK_LIB(readline, readline,,)
+AC_ARG_ENABLE(readline,        [  --enable-readline  use readline library],,
+                       enable_readline="yes")
+if test "$enable_readline" = "yes" ; then
+   LIBREADLINE="-lreadline -lncurses"
+   HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1"
+else 
+   LIBREADLINE=""
+   HAVE_LIBREADLINE=""
+fi
+AC_SUBST(LIBREADLINE)
+AC_SUBST(HAVE_LIBREADLINE)
+
+AC_ARG_ENABLE(efence,  [  --enable-efence  use efence library],,
+                       enable_efence="no")
+if test "$enable_efence" = "yes" ; then
+   LIBEFENCE="-lefence"
+   HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1"
+else 
+   LIBEFENCE=""
+   HAVE_LIBEFENCE=""
+fi
+AC_SUBST(LIBEFENCE)
+AC_SUBST(HAVE_LIBEFENCE)
+
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
+AC_MSG_CHECKING(if you are building lib lustre)
+if test "$host_cpu" = "lib"; then
+   AC_MSG_RESULT(yes)
+   libdir='${exec_prefix}/lib/lustre'
+else
+   AC_MSG_RESULT(no)
+fi
+
+# end -------- Kernel build environment. -----------------
+
+
diff --git a/lustre/portals/configure.in b/lustre/portals/configure.in
new file mode 100644 (file)
index 0000000..7c32246
--- /dev/null
@@ -0,0 +1,38 @@
+# This version is here to make autoconf happy; the name is a file which is
+# "unique" to this directory so that configure knows where it should run.
+AC_INIT(knals/Makefile.am, 3.0)
+AC_CANONICAL_SYSTEM
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Automake variables.  The version number is taken from the IVERSION define in libcfs/module.c
+AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c]))
+# AM_MAINTAINER_MODE
+
+# Pull in the architecture, build and portals-specific configuration
+# fragments; sinclude silently skips a fragment that does not exist.
+sinclude(archdep.m4)
+sinclude(build.m4)
+sinclude(portalsconf.m4)
+
+if test x$enable_inkernel = xyes ; then
+cp Kernelenv.mk Kernelenv.in
+cp Makefile.mk Makefile.in
+cp libcfs/Makefile.mk libcfs/Makefile.in
+cp portals/Makefile.mk portals/Makefile.in
+cp knals/Makefile.mk knals/Makefile.in
+cp knals/socknal/Makefile.mk knals/socknal/Makefile.in
+cp router/Makefile.mk router/Makefile.in
+AC_OUTPUT(Kernelenv)
+fi
+
+
+AM_CONFIG_HEADER(include/config.h)
+
+AC_OUTPUT([Rules.linux Makefile libcfs/Makefile portals/Makefile \
+          unals/Makefile knals/Makefile router/Makefile \
+         knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \
+         knals/scimacnal/Makefile knals/toenal/Makefile \
+          utils/Makefile tests/Makefile doc/Makefile \
+          packaging/Makefile packaging/portals.spec ])
+
diff --git a/lustre/portals/doc/.cvsignore b/lustre/portals/doc/.cvsignore
new file mode 100644 (file)
index 0000000..827dca4
--- /dev/null
@@ -0,0 +1,4 @@
+Makefile
+Makefile.in
+*.eps
+*.pdf
diff --git a/lustre/portals/doc/Data-structures b/lustre/portals/doc/Data-structures
new file mode 100644 (file)
index 0000000..b5532b1
--- /dev/null
@@ -0,0 +1,65 @@
+In this document I will try to draw the data structures and how they
+interrelate in the Portals 3 reference implementation.  It is probably
+best shown with a drawing, so there may be an additional xfig or
+Postscript figure.
+
+
+MEMORY POOLS:
+------------
+
+First, a digression on memory allocation in the library.  As mentioned
+in the NAL Writer's Guide, the library does not link against any
+standard C libraries and as such is unable to dynamically allocate
+memory on its own.  It requires that the NAL implement a method
+for allocation that is appropriate for the protection domain in
+which the library lives.  This is only called when a network
+interface is initialized to allocate the Portals object pools.
+
+These pools are preallocated blocks of objects that the library
+can rapidly make active and manage with a minimum of overhead.
+This also cuts down on overhead for setting up structures
+since the NAL->malloc() callback does not need to be called
+for each object.
+
+The objects are maintained on a per-object type singly linked free
+list and contain a pointer to the next free object.  This pointer
+is NULL if the object is not on the free list and is non-zero
+if it is on the list.  The special sentinel value of 0xDEADBEEF
+is used to mark the end of the free list since NULL could
+indicate that the last object in the list is not free.
+
+When one of the lib_*_alloc() functions is called, the library
+returns the head of the free list and advances the head pointer
+to the next item on the list.  The special case of 0xDEADBEEF is
+checked and a NULL pointer is returned if there are no more
+objects of this type available.   The lib_*_free() functions
+are even simpler -- check to ensure that the object is not already
+free, set its next pointer to the current head and then set
+the head to be this newly freed object.
+
+Since C does not have templates, I did the next best thing and wrote
+the memory pool allocation code as a macro that expands based on the
+type of the argument.  The mk_alloc(T) macro expands to
+write the _lib_T_alloc() and lib_T_free() functions.
+It requires that the object have a pointer of the type T named
+"next_free".  There are also functions that map _lib_T_alloc()
+to lib_T_alloc() so that the library can add some extra
+functionality to the T constructor.
+
+
+
+LINKED LISTS:
+------------
+
+Many of the active Portals objects are stored in doubly linked lists
+when they are active.  These are always implemented with the pointer
+to the next object and a pointer to the next pointer of the
+previous object.  This avoids the "dummy head" object or
+special cases for inserting at the beginning or end of the list.
+The pointer manipulations are a little hairy at times, but
+I hope that they are understandable.
+
+The actual linked list code is implemented as macros in <lib-p30.h>,
+although the object has to know about 
+
+
diff --git a/lustre/portals/doc/Makefile.am b/lustre/portals/doc/Makefile.am
new file mode 100644 (file)
index 0000000..7c65e6c
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+LYX2PDF = lyx --export pdf
+LYX2TXT = lyx --export text
+LYX2HTML = lyx --export html
+SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps
+
+DOCS = portals3.pdf 
+IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps
+LYXFILES= portals3.lyx
+
+MAINTAINERCLEANFILES =  $(IMAGES) $(DOCS) $(GENERATED)
+GENERATED = 
+EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) 
+
+all: $(DOCS)
+
+# update date and version in document
+date := $(shell date +%x)
+tag := $(shell echo '$$Name:  $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/')
+addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g'
+
+# Regenerate when the $(VERSION) or $Name:  $ changes.
+.INTERMEDIATE: $(GENERATED)
+$(GENERATED) : %.lyx: %.lin Makefile
+       $(addversion) $< > $@
+
+.lyx.pdf:
+       @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n"
+
+.lyx.txt:
+       @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n"
+.lyx.html:
+       @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n"
+.fig.eps:
+       -fig2dev -L eps $< > $@
+
+portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx
+
+syncweb: portals3.pdf
+#      cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf
+#      ( cd /usr/src/www ; make lustre ; make synclustre )
+
diff --git a/lustre/portals/doc/Message-life-cycle b/lustre/portals/doc/Message-life-cycle
new file mode 100644 (file)
index 0000000..e8cc7e2
--- /dev/null
@@ -0,0 +1,118 @@
+This documents the life cycle of a message as it arrives and is handled by
+a basic async, packetized NAL.  There are four types of messages that have
+slightly different life cycles, so they are addressed independently.
+
+
+Put request
+-----------
+
+1.  NAL notices that there is an incoming message header on the network
+and reads a ptl_hdr_t in from the wire.
+
+2.  It may store additional NAL specific data that provides context
+for this event in a void* that it will interpret in some fashion
+later.
+
+3.  The NAL calls lib_parse() with a pointer to the header and its
+private data structure.
+
+4.  The library decodes the header and may build a message state
+object that describes the event to be written and the ACK to be
+sent, if any.  It then calls nal->recv() with the private data
+that the NAL passed in, a pointer to the message state object
+and a translated user address.
+
+       The NAL will have been given a chance to pretranslate
+       all user addresses when the buffers are created.  This
+       process is described in the NAL-HOWTO.
+
+5.  The NAL should restore whatever context it required from the
+private data pointer, begin receiving the bytes and possibly store
+some extra state of its own.  It should return at this point.
+
+
+
+Get request
+-----------
+
+1.  As with a Put, the NAL notices the incoming message header and
+passes it to lib_parse().
+
+2.  The library decodes the header and calls nal->recv() with a
+zero byte length, offset and destination to instruct it to clean
+up the wire after reading the header.  The private data will
+be passed in as well, allowing the NAL to retrieve any state
+or context that it requires.
+
+3.  The library may build a message state object to possibly
+write an event log or invalidate a memory region.
+
+4.  The library will build a ptl_msg_t header that specifies the
+Portals protocol information for delivery at the remote end.
+
+5.  The library calls nal->send() with the pre-built header,
+the optional message state object, the four part address
+component, a translated user pointer + offset, and some
+other things.
+
+6.  The NAL is to put the header on the wire or copy it at
+this point (since it is off the stack).  It should store some
+amount of state about its current position in the message and
+the destination address.
+
+7.  And then return to the library.
+
+
+Reply request
+-------------
+
+1.  Starting at "The library decodes the header..."
+
+2.  The library decodes the header and calls nal->recv()
+to bring in the rest of the message.  Flow continues in
+exactly the same fashion as with all other receives.
+
+
+Ack request
+-----------
+
+1.  The library decodes the header, builds the appropriate data
+structures for the event in a message state object and calls nal->recv()
+with a zero byte length, etc.
+
+
+Packet arrival
+--------------
+
+1.  The NAL should notice the arrival of a packet, retrieve whatever
+state it needs from the message ID or other NAL specific header data
+and place the data bytes directly into the user address that was
+given to nal->recv().
+
+       How this happens is outside the scope of the Portals library
+       and solely determined by the NAL...
+
+2.  If this is the last packet in a message, the NAL should retrieve
+the lib_msg_t *cookie that it was given in the call to nal->recv()
+and pass it to lib_finalize().  lib_finalize() may call nal->send()
+to send an ACK, nal->write() to record an entry in the event log,
+nal->invalidate() to unregister a region of memory or do nothing at all.
+
+3.  It should then clean up any remaining NAL specific state about
+the message and go back into the main loop.
+
+
+Outgoing packets
+----------------
+
+1.  When the NAL has pending output, it should put the packets on
+the wire wrapped with whatever implementation specified wrappers.
+
+2.  Once it has output all the packets of a message it should
+call lib_finalize() with the message state object that was
+handed to nal->send().  This will allow the library to clean
+up its state regarding the message and write any pending event
+entries.
+
+
+
diff --git a/lustre/portals/doc/NAL-HOWTO b/lustre/portals/doc/NAL-HOWTO
new file mode 100644 (file)
index 0000000..ea38aed
--- /dev/null
@@ -0,0 +1,293 @@
+This document is a first attempt at describing how to write a NAL
+for the Portals 3 library.  It also defines the library architecture
+and the abstraction of protection domains.
+
+
+First, an overview of the architecture:
+
+    Application
+
+----|----+--------
+         |
+   API  === NAL        (User space)
+         |   
+---------+---|-----
+         |    
+   LIB  === NAL        (Library space)
+         |
+---------+---|-----
+          
+    Physical wire      (NIC space)
+          
+
+Application
+    API
+API-side NAL
+------------
+LIB-side NAL
+    LIB
+LIB-side NAL
+   wire
+
+Communication is through the indicated paths via well defined
+interfaces.  The API and LIB portions are written to be portable
+across platforms and do not depend on the network interface.
+
+Communication between the application and the API code is
+defined in the Portals 3 API specification.  This is the
+user-visible portion of the interface and should be the most
+stable.
+
+
+
+API-side NAL:
+------------
+
+The user space NAL needs to implement only a few functions
+that are stored in a nal_t data structure and called by the
+API-side library:
+
+       int forward( nal_t *nal,
+               int     index,
+               void    *args,
+               size_t  arg_len,
+               void    *ret,
+               size_t  ret_len
+       );
+
+Most of the data structures in the portals library are held in
+the LIB section of the code, so it is necessary to forward API
+calls across the protection domain to the library.  This is
+handled by the NAL's forward method.  Once the argument and return
+blocks are on the remote side the NAL should call lib_dispatch()
+to invoke the appropriate API function.
+
+       int validate( nal_t *nal,
+               void    *base,
+               size_t  extent,
+               void    **trans_base,
+               void    **trans_data
+       );
+
+The validate method provides a means for the NAL to prevalidate
+and possibly pretranslate user addresses into a form suitable
+for fast use by the network card or kernel module.  The trans_base
+pointer will be used by the library every time it needs to
+refer to the block of memory.  The trans_data result is a
+cookie that will be handed to the NAL along with the trans_base.
+
+The library never performs calculations on the trans_base value;
+it only computes offsets that are then handed to the NAL.
+
+
+       int shutdown( nal_t *nal, int interface );
+
+Brings down the network interface.  The remote NAL side should
+call lib_fini() to bring down the library side of the network.
+
+       void yield( nal_t *nal );
+
+This allows the user application to gracefully give up the processor
+while busy waiting.  Performance critical applications may not
+want to take the time to call this function, so it should be an
+option to the PtlEQWait call.  Right now it is not implemented as such.
+
+Lastly, the NAL must implement a function named PTL_IFACE_*, where
+* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR.
+This initialization function is to set up communication with the
+library-side NAL, which should call lib_init() to bring up the
+network interface.
+
+
+
+LIB-side NAL:
+------------
+
+On the library-side, the NAL has much more responsibility.  It
+is responsible for calling lib_dispatch() on behalf of the user,
+it is also responsible for bringing packets off the wire and
+pushing bits out.  As on the user side, the methods are stored
+in a nal_cb_t structure that is defined on a per network
+interface basis.
+
+The calls to lib_dispatch() need to be examined.  The prototype:
+
+       void    lib_dispatch(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       int                     index,
+                       void                    *arg_block,
+                       void                    *ret_block
+       );
+
+has two complications.  The private field is a NAL-specific
+value that will be passed to any callbacks produced as a result
+of this API call.  Kernel module implementations may use this
+for task structures, or perhaps network card data.  It is ignored
+by the library.
+
+Secondly, the arg_block and ret_block must be in the same protection
+domain as the library.  The NAL's two halves must communicate the
+sizes and perform the copies.  After the call, the buffer pointed
+to by ret_block will be filled in and should be copied back to
+the user space.  How this is to be done is NAL specific.
+
+       int lib_parse(
+                       nal_cb_t                *nal,
+                       ptl_hdr_t               *hdr,
+                       void                    *private
+       );
+
+This is the only other entry point into the library from the NAL.
+When the NAL detects an incoming message on the wire it should read
+sizeof(ptl_hdr_t) bytes and pass a pointer to the header to
+lib_parse().  It may set private to be anything that it needs to
+tie the incoming message to callbacks that are made as a result
+of this event.
+
+The method calls are:
+
+       int     (*send)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       ptl_hdr_t               *hdr,
+                       int                     nid,
+                       int                     pid,
+                       int                     gid,
+                       int                     rid,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  len
+       );
+
+This is a tricky function -- it must support async output
+of messages as well as properly synchronized event log writing.
+The private field is the same that was passed into lib_dispatch()
+or lib_parse() and may be used to tie this call to the event
+that initiated the entry to the library.
+
+The cookie is a pointer to a library private value that must
+be passed to lib_finalize() once the message has been completely
+sent.  It should not be examined by the NAL for any meaning.
+
+The four ID fields are passed in, although some implementations
+may not use all of them.
+
+The single base pointer has been replaced with the translated
+address that the API NAL generated in the api_nal->validate()
+call.  The trans_data is unchanged and the offset is in bytes.
+
+
+       int     (*recv)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  mlen,
+                       size_t                  rlen
+       );
+
+This callback will only be called in response to lib_parse().
+The cookie, trans_base and trans_data are as discussed in send().
+The NAL should read mlen bytes from the wire, deposit them into
+trans_base + offset and then discard (rlen - mlen) bytes.
+Once the entire message has been received the NAL should call
+lib_finalize() with the lib_msg_t *cookie.
+
+The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0
+are used to indicate that the NAL should clean up the wire.  This could
+be implemented as a blocking call, although having it return as quickly
+as possible is desirable.
+
+       int     (*write)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       user_ptr                trans_addr,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+
+                       void                    *src_addr,
+                       size_t                  len
+       );
+
+This is essentially a cross-protection domain memcpy().  The user address
+has been pretranslated by the api_nal->validate() call.
+
+       void    *(*malloc)(
+                       nal_cb_t                *nal,
+                       size_t                  len
+       );
+
+       void    (*free)(
+                       nal_cb_t                *nal,
+                       void                    *buf
+       );
+
+Since the NAL may be in a non-standard hosted environment it can
+not call malloc().  This allows the library side NAL to implement
+the system specific malloc().  In the current reference implementation
+the library only calls nal->malloc() when the network interface is
+initialized and then calls free when it is brought down.  The library
+maintains its own pool of objects for allocation so only one call to
+malloc is made per object type.
+
+       void    (*invalidate)(
+                       nal_cb_t                *nal,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  extent
+       );
+
+User addresses are validated/translated at the user-level API NAL
+method, which is likely to push them to this level.  Meanwhile,
+the library NAL will be notified when the library no longer
+needs the buffer.  Overlapped buffers are not detected by the
+library, so the NAL should ref count each page involved.
+
+Unfortunately we have a few bugs when the invalidate method is
+called.  It is still in progress...
+
+       void    (*printf)(
+                       nal_cb_t                *nal,
+                       const char              *fmt,
+                       ...
+       );
+
+As with malloc(), the library does not have any way to do printf
+or printk.  It is not necessary for the NAL to implement this
+call, although it will make debugging difficult.
+
+       void    (*cli)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+       void    (*sti)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+These are used by the library to mark critical sections.
+
+       int     (*gidrid2nidpid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                gid,
+                       ptl_id_t                rid,
+                       ptl_id_t                *nid,
+                       ptl_id_t                *pid
+       );
+
+
+       int     (*nidpid2gidrid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                nid,
+                       ptl_id_t                pid,
+                       ptl_id_t                *gid,
+                       ptl_id_t                *rid
+       );
+
+Rolf added these.  I haven't looked at how they have to work yet.
diff --git a/lustre/portals/doc/file.fig b/lustre/portals/doc/file.fig
new file mode 100644 (file)
index 0000000..914c294
--- /dev/null
@@ -0,0 +1,111 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1200 750 1650 1050
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1050 1650 750 1200 750 1200 1050 1650 1050
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001
+-6
+6 1200 2325 1650 2625
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2625 1650 2325 1200 2325 1200 2625 1650 2625
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001
+-6
+6 1200 1800 1650 2100
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2100 1650 1800 1200 1800 1200 2100 1650 2100
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001
+-6
+6 1200 1275 1650 1575
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1575 1650 1275 1200 1275 1200 1575 1650 1575
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001
+-6
+6 450 750 900 1200
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 825 450 1050
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1050 900 825
+-6
+6 450 2325 900 2775
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 2400 450 2625
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2625 900 2400
+-6
+6 450 1800 900 2250
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1875 450 2100
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2100 900 1875
+-6
+6 450 1275 900 1725
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1350 450 1575
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1575 900 1350
+-6
+6 2250 750 3450 2625
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1200 3150 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1500 3150 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1800 3150 1800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2100 3150 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 975 3150 975 3150 2625 2550 2625 2550 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2400 3150 2400
+4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2400 2550 1350
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1875 2550 1050
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1425 2550 1950
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 900 2550 1650
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 900 1200 900
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1425 1200 1425
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1950 1200 1950
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2475 1200 2475
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2025 2550 2250
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2550 2550 2475
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1875 2850 1875 600 225 600 225 2850 1875 2850
+4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001
diff --git a/lustre/portals/doc/flow_new.fig b/lustre/portals/doc/flow_new.fig
new file mode 100644 (file)
index 0000000..d828dea
--- /dev/null
@@ -0,0 +1,213 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 525 2175 1575 2925
+6 675 2287 1425 2812
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001
+4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 2550 1050 2175 525 2550 1050 2925 1575 2550
+-6
+6 3450 1275 4350 1725
+6 3600 1312 4200 1687
+4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001
+4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3450 1275 4350 1275 4350 1725 3450 1725 3450 1275
+-6
+6 4650 1275 5550 1725
+6 4725 1312 5475 1687
+4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001
+4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4650 1275 5550 1275 5550 1725 4650 1725 4650 1275
+-6
+6 1350 525 2250 975
+6 1350 562 2250 937
+4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001
+4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 525 2250 525 2250 975 1350 975 1350 525
+-6
+6 525 1125 1575 1875
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 1500 1050 1125 525 1500 1050 1875 1575 1500
+4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001
+-6
+6 2340 1237 2940 1687
+6 2340 1237 2940 1687
+4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001
+4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001
+4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001
+-6
+-6
+6 525 3225 1575 3975
+6 675 3375 1425 3750
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001
+-6
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        525 3600 1050 3225 1575 3600 1050 3975 525 3600
+-6
+6 3300 3375 4350 3825
+6 3300 3412 4350 3787
+4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3300 3375 4350 3375 4350 3825 3300 3825 3300 3375
+-6
+6 1950 3225 3000 3975
+6 2250 3450 2700 3750
+4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        3000 3600 2475 3225 1950 3600 2475 3975 3000 3600
+-6
+6 3150 4500 4200 4950
+6 3150 4537 4200 4912
+4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3150 4500 4200 4500 4200 4950 3150 4950 3150 4500
+-6
+6 600 4500 1500 4950
+6 675 4537 1425 4912
+4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001
+4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        600 4500 1500 4500 1500 4950 600 4950 600 4500
+-6
+6 4650 4350 5700 5100
+6 4950 4537 5400 4912
+6 4950 4537 5400 4912
+4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001
+4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001
+-6
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        5700 4725 5175 4350 4650 4725 5175 5100 5700 4725
+-6
+6 6000 4500 6900 4950
+6 6225 4575 6675 4875
+4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001
+4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        6000 4500 6900 4500 6900 4950 6000 4950 6000 4500
+-6
+6 1800 4350 2850 5100
+6 2100 4575 2550 4875
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        2850 4725 2325 4350 1800 4725 2325 5100 2850 4725
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 1875 1050 2175
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 1500 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 450 1050 1125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1350 750 1050 750
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 2925 1050 3225
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3150 1500 3450 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4350 1500 4650 1500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        2100 1500 2625 1125 3150 1500 2625 1875 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 3600 1950 3600
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 3975 1050 4500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 3600 3300 3600
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 4725 1800 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        5700 4725 6000 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2850 4725 3150 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4200 4725 4650 4725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        6900 4725 7950 4725
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1575 2550 1650 2550 1800 2550 1800 2400 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        2250 750 2475 750 2625 750 2625 900 2625 1125
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        7500 4725 7500 1650 7500 1500 7350 1500 5550 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        2475 3225 2475 2400 2475 2250 2325 2250 1800 2250
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        3825 3375 3825 2175 3825 2025 3675 2025 1800 2025
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125
+        4425 4275 4425 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125
+        7275 4275 7275 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001
+4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001
diff --git a/lustre/portals/doc/get.fig b/lustre/portals/doc/get.fig
new file mode 100644 (file)
index 0000000..28db949
--- /dev/null
@@ -0,0 +1,33 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 2775 900 3525 1200
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001
+-6
+6 1350 1725 2175 2025
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 750
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 825 2700 1275
+2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1350 900 1950
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
diff --git a/lustre/portals/doc/ieee.bst b/lustre/portals/doc/ieee.bst
new file mode 100644 (file)
index 0000000..5367caa
--- /dev/null
@@ -0,0 +1,1114 @@
+% ---------------------------------------------------------------
+%
+% $Id: ieee.bst,v 1.1.2.1 2003/05/19 04:25:30 braam Exp $
+%
+% by Paolo.Ienne@di.epfl.ch
+%
+% ---------------------------------------------------------------
+%
+% no guarantee is given that the format corresponds perfectly to 
+% IEEE 8.5" x 11" Proceedings, but most features should be ok.
+%
+% ---------------------------------------------------------------
+%
+% `ieee' from BibTeX standard bibliography style `abbrv'
+% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
+% Copyright (C) 1985, all rights reserved.
+% Copying of this file is authorized only if either
+% (1) you make absolutely no changes to your copy, including name, or
+% (2) if you do make changes, you name it something other than
+% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
+% This restriction helps ensure that all standard styles are identical.
+% The file btxbst.doc has the documentation for this style.
+
+ENTRY  % declares, in order: database fields, integer entry variables, string entry variables
+  { address  % first list: the fields an entry may carry
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}  % second list: no integer entry variables
+  { label }  % third list: one string entry variable, set by longest.label.pass
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }  % output.state holds one of the four constants declared after it
+
+FUNCTION {init.state.consts}  % give the four output-state constants distinct integer values
+{ #0 'before.all :=  % state set by output.bibitem before anything is written
+  #1 'mid.sentence :=  % state set by output.nonnull after each item
+  #2 'after.sentence :=  % state requested by new.sentence
+  #3 'after.block :=  % state requested by new.block
+}
+
+STRINGS { s t }  % global scratch strings reused by many functions in this style
+
+FUNCTION {output.nonnull}  % write the pending string below the new item, punctuated according to output.state, then leave the new item pending
+{ 's :=  % pop the new item into s; the previously pending string is now on top
+  output.state mid.sentence =
+    { ", " * write$ }  % mid-sentence: pending string is followed by a comma
+    { output.state after.block =
+ { add.period$ write$  % block boundary: close the sentence ...
+   newline$
+   "\newblock " write$  % ... and open a new block on a fresh line
+ }
+ { output.state before.all =
+     'write$  % start of entry: write the pending string unchanged
+     { add.period$ " " * write$ }  % remaining case (after.sentence): period plus space
+   if$
+ }
+      if$
+      mid.sentence 'output.state :=  % all three non-comma paths fall back to mid.sentence
+    }
+  if$
+  s  % push the new item; it is now the pending string
+}
+
+FUNCTION {output}  % hand the item on the stack to output.nonnull unless it is empty
+{ duplicate$ empty$
+    'pop$  % empty item: drop it, the pending string stays untouched
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}  % like output, but warn when the item is empty; expects the item and, on top of it, a description of the field
+{ 't :=  % t = description used in the warning text
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }  % drop the empty item and warn, naming the description and the cite key
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}  % start an entry: write the \bibitem line for the current cite key and reset the output state
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""  % null pending string for the first output call to consume
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}  % finish an entry: write the last pending string and end the line
+{ add.period$  % built-in that terminates the string with a period
+  write$
+  newline$
+}
+
+FUNCTION {new.block}  % request a block break before the next item that gets written
+{ output.state before.all =
+    'skip$  % nothing written yet for this entry, so the state is left alone
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}  % request a sentence break before the next item that gets written
+{ output.state after.block =
+    'skip$  % an already requested block break is kept
+    { output.state before.all =
+ 'skip$  % nothing written yet for this entry, so the state is left alone
+ { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}  % logical negation of the integer on top of the stack
+{   { #0 }  % true becomes 0
+    { #1 }  % false becomes 1
+  if$
+}
+
+FUNCTION {and}  % logical AND of the two integers on top of the stack
+{   'skip$  % top is true: the remaining operand is the result
+    { pop$ #0 }  % top is false: discard the other operand and push 0
+  if$
+}
+
+FUNCTION {or}  % logical OR of the two integers on top of the stack
+{   { pop$ #1 }  % top is true: discard the other operand and push 1
+    'skip$  % top is false: the remaining operand is the result
+  if$
+}
+
+FUNCTION {new.block.checka}  % call new.block unless the string on the stack is empty; the string is consumed
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}  % call new.block unless both strings on the stack are empty; both are consumed
+{ empty$
+  swap$ empty$  % bring the second string to the top and test it as well
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}  % call new.sentence unless the string on the stack is empty; the string is consumed
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}  % call new.sentence unless both strings on the stack are empty; both are consumed
+{ empty$
+  swap$ empty$  % bring the second string to the top and test it as well
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}  % replace an empty value on the stack by the null string, leave any other value as is
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}  % wrap a non-empty string in an \em group; an empty one becomes the null string
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }  % swap$ puts the opening markup in front of the text
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }  % loop state shared by format.names and sort.format.names
+
+FUNCTION {format.names}  % turn the name list on the stack into one string, names joined by commas and a final and
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=  % t = current name: first-name initials, von part, last name, junior part
+      nameptr #1 >
+ { namesleft #1 >
+     { ", " * t * }  % neither first nor last name: comma-separate
+     { numnames #2 >
+  { "," * }  % more than two names: comma before the last one
+  'skip$
+       if$
+       t "others" =
+  { " et~al." * }  % a final name of others is printed as et al.
+  { " and " * t * }
+       if$
+     }
+   if$
+ }
+ 't  % first name: simply push it as the start of the result
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}  % push the formatted author list, or the null string when there is no author
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}  % push the formatted editor list followed by an editor/editors tag, or the null string
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+ { ", editors" * }  % plural tag for more than one editor
+ { ", editor" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}  % push the title after case conversion, or the null string when there is no title
+{ title empty$
+    { "" }
+    { title "t" change.case$ }  % "t" selects the built-in title-style case conversion
+  if$
+}
+
+FUNCTION {n.dashify}  % copy the string on the stack, turning each isolated hyphen into a double hyphen; longer hyphen runs are copied unchanged
+{ 't :=
+  ""  % result accumulator
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+ { t #1 #2 substring$ "--" = not
+     { "--" *  % isolated hyphen: emit two
+       t #2 global.max$ substring$ 't :=  % drop the first character of t
+     }
+     {   { t #1 #1 substring$ "-" = }  % run of two or more hyphens: copy them one by one
+  { "-" *
+    t #2 global.max$ substring$ 't :=
+  }
+       while$
+     }
+   if$
+ }
+ { t #1 #1 substring$ *  % any other character is copied as is
+   t #2 global.max$ substring$ 't :=
+ }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}  % push month and year joined by a space, whichever of them exist; warn about a month without a year
+{ year empty$
+    { month empty$
+ { "" }  % neither field: null string
+ { "there's a month but no year in " cite$ * warning$
+   month  % the month alone is still used after the warning
+ }
+      if$
+    }
+    { month empty$
+ 'year
+ { month " " * year * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.btitle}  % push the title in emphasized form, without the case change that format.title applies
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}  % join the two strings on the stack, using a tie when the second one is short and a space otherwise
+{ duplicate$ text.length$ #3 <  % short means fewer than three text characters
+    { "~" }
+    { " " }
+  if$
+  swap$ * *  % reorder to first, connector, second and concatenate
+}
+
+FUNCTION {either.or.check}  % expects a description and, on top of it, a field value; warns when that field is non-empty
+{ empty$
+    'pop$  % field unused: just discard the description
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}  % push the volume phrase for a book, with the emphasized series appended when present
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+ 'skip$
+ { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check  % warn when number is given together with volume
+    }
+  if$
+}
+
+FUNCTION {format.number.series}  % push the number-in-series phrase; yields the null string whenever a volume is given
+{ volume empty$
+    { number empty$
+ { series field.or.null }  % no number: the series alone, or the null string
+ { output.state mid.sentence =
+     { "number" }  % lower case inside a sentence
+     { "Number" }  % capitalized otherwise
+   if$
+   number tie.or.space.connect
+   series empty$
+     { "there's a number but no series in " cite$ * warning$ }
+     { " in " * series * }
+   if$
+ }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}  % push the edition phrase, case-converted according to its position in the sentence
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+ { edition "l" change.case$ " edition" * }  % mid-sentence: converted with the lower-case option
+ { edition "t" change.case$ " edition" * }  % otherwise: converted with the title-style option
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }  % result flag of multi.page.check
+
+FUNCTION {multi.page.check}  % push 1 when the string on the stack contains a hyphen, comma or plus sign, else 0
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not  % keep scanning while nothing was found ...
+      t empty$ not  % ... and characters remain
+      and
+    }
+    { t #1 #1 substring$  % first remaining character
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }  % no match: drop the character and continue
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}  % push the pages field prefixed by page or pages, or the null string when the field is empty
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+ { "pages" pages n.dashify tie.or.space.connect }  % several pages: plural word, hyphens normalized
+ { "page" pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}  % push the volume, the number in parentheses and the pages after a colon, using whichever parts exist
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "(" number * ")" * *  % append the parenthesized number to the volume string
+      volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+ { pop$ format.pages }  % nothing built so far: fall back to the plain pages phrase
+ { ":" * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}  % push the chapter phrase followed by the pages phrase, or only the pages phrase when there is no chapter
+{ chapter empty$
+    'format.pages
+    { type empty$
+ { "chapter" }
+ { type "l" change.case$ }  % a type field replaces the word chapter, converted with the lower-case option
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+ 'skip$
+ { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}  % push the In-phrase naming the editors, if any, and the emphasized booktitle
+{ booktitle empty$
+    { "" }
+    { editor empty$
+ { "In " booktitle emphasize * }
+ { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}  % warn when a misc entry has none of the fields that misc prints
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and  % true only when all six fields are empty
+  key empty$ not and  % warn only for entries that have a key; NOTE(review): presumably because author.sort already warns when author and key are both empty
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}  % expects a default thesis label on the stack; a non-empty type field replaces it
+{ type empty$
+    'skip$  % keep the default label
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}  % push the report label, which is the type field or a default, followed by the report number when given
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }  % no number: the label alone, after title-style case conversion
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}  % push the In-phrase for an article that cross-references another entry, ending in a \cite of that entry
+{ key empty$
+    { journal empty$
+ { "need key or journal for " cite$ * " to crossref " * crossref *
+   warning$
+   ""  % nothing to name the parent with: only the \cite remains
+ }
+ { "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }  % a key, when present, is preferred over the journal name
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}  % push a short editor reference: one name, two names joined by and, or the first name plus et al.
+{ editor #1 "{vv~}{ll}" format.name$  % first editor, von part and last name only
+  editor num.names$ duplicate$  % the count is needed for up to two comparisons
+  #2 >
+    { pop$ " et~al." * }  % more than two editors; pop$ removes the spare count
+    { #2 <
+ 'skip$  % a single editor: nothing to append
+ { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+     { " et~al." * }  % second editor given as others
+     { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+   if$
+ }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}  % push the phrase for a book or inbook that cross-references another entry, ending in a \cite of that entry
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =  % an editor identical to the author is treated like no editor
+  or
+    { key empty$
+ { series empty$
+     { "need editor, key, or series for " cite$ * " to crossref " *
+       crossref * warning$
+       "" *
+     }
+     { "{\em " * series * "\/}" * }
+   if$
+ }
+ { key * }  % a key, when present, is preferred over the series
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}  % push the In-phrase for an incollection or inproceedings that cross-references another entry
+{ editor empty$
+  editor field.or.null author field.or.null =  % an editor identical to the author is treated like no editor
+  or
+    { key empty$
+ { booktitle empty$
+     { "need editor, key, or booktitle for " cite$ * " to crossref " *
+       crossref * warning$
+       ""
+     }
+     { "In {\em " booktitle * "\/}" * }
+   if$
+ }
+ { "In " key * }  % a key, when present, is preferred over the booktitle
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {article}  % entry type article: authors, title, then journal details or a cross-reference, then the note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check  % stand-alone entry: journal, volume(number):pages, date
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull  % cross-referencing entry: In-phrase and pages
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}  % entry type book: authors or editors, title, then publication details or a cross-reference, edition, date, note
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }  % no author: the editors take the author position
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }  % warn when both author and editor are given
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}  % entry type booklet: optional authors, title, how published, address, date, note
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title "title" output.check  % title is the only field that triggers an empty-field warning here
+  howpublished address new.block.checkb  % new block only if at least one of the two fields exists
+  howpublished output
+  address output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}  % entry type inbook: like book, with a chapter and/or pages phrase after the title
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }  % no author: the editors take the author position
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }  % warn when both author and editor are given
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}  % entry type incollection: authors, title, then the containing book's details or a cross-reference, then the note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check  % stand-alone entry: full description of the collection
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull  % cross-referencing entry: In-phrase plus chapter/pages
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}  % entry type inproceedings: authors, title, then the proceedings' details or a cross-reference, then the note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+ { organization publisher new.sentence.checkb  % no address: organization, publisher, then date
+   organization output
+   publisher output
+   format.date "year" output.check
+ }
+ { address output.nonnull  % with address: address and date first, organization and publisher in a new sentence
+   format.date "year" output.check
+   new.sentence
+   organization output
+   publisher output
+ }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }  % entry type conference is formatted exactly like inproceedings
+
+FUNCTION {manual}  % entry type manual: authors or organization first, title, remaining organization/address, edition, date, note
+{ output.bibitem
+  author empty$
+    { organization empty$
+ 'skip$
+ { organization output.nonnull  % no author: organization and address take the author position
+   address output
+ }
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+ { address new.block.checka  % neither author nor organization: the address has not been written yet
+   address output
+ }
+ 'skip$  % organization and address were already written before the title
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}  % entry type mastersthesis: authors, title, thesis label, school, address, date, note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull  % default label, replaced by a non-empty type field
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}  % entry type misc: every field is optional; warns only through empty.misc.check
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb  % new block only if a title or howpublished follows
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check  % runs after the entry has been written
+}
+
+FUNCTION {phdthesis}  % entry type phdthesis: authors, emphasized title, thesis label, school, address, date, note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check  % unlike mastersthesis, the title goes through format.btitle
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull  % default label, replaced by a non-empty type field
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}  % entry type proceedings: editors or organization first, title, volume/series, then address, organization, publisher and date
+{ output.bibitem
+  editor empty$
+    { organization output }  % no editor: the organization takes the leading position
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+ { publisher new.sentence.checka }  % organization was already written at the top
+ { organization publisher new.sentence.checkb
+   organization output
+ }
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull  % with address: address and date first, the rest in a new sentence
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+ 'skip$  % organization was already written at the top
+ { organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}  % entry type techreport: authors, title, report label and number, institution, address, date, note
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}  % entry type unpublished: authors, title, a note that is checked for emptiness, optional date
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check  % here an empty note produces a warning
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }  % entry types without a function of their own are formatted like misc
+
+MACRO {jan} {"Jan."}  % month macros: three-letter names a database can use unquoted
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"Mar."}
+
+MACRO {apr} {"Apr."}
+
+MACRO {may} {"May"}  % May, June and July are spelled out in full
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+MACRO {acmcs} {"ACM Comput. Surv."}  % journal macros: each name expands to an abbreviated journal title
+
+MACRO {acta} {"Acta Inf."}
+
+MACRO {cacm} {"Commun. ACM"}
+
+MACRO {ibmjrd} {"IBM J. Res. Dev."}
+
+MACRO {ibmsj} {"IBM Syst.~J."}
+
+MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
+
+MACRO {ieeetc} {"IEEE Trans. Comput."}
+
+MACRO {ieeetcad}
+ {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
+
+MACRO {ipl} {"Inf. Process. Lett."}
+
+MACRO {jacm} {"J.~ACM"}
+
+MACRO {jcss} {"J.~Comput. Syst. Sci."}
+
+MACRO {scp} {"Sci. Comput. Programming"}
+
+MACRO {sicomp} {"SIAM J. Comput."}
+
+MACRO {tocs} {"ACM Trans. Comput. Syst."}
+
+MACRO {tods} {"ACM Trans. Database Syst."}
+
+MACRO {tog} {"ACM Trans. Gr."}
+
+MACRO {toms} {"ACM Trans. Math. Softw."}
+
+MACRO {toois} {"ACM Trans. Office Inf. Syst."}
+
+MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
+
+MACRO {tcs} {"Theoretical Comput. Sci."}
+
+READ  % read the database entries; everything after this point can use their fields
+
+FUNCTION {sortify}  % normalize a string for use in a sort key
+{ purify$  % built-in that strips characters not wanted in sort keys
+  "l" change.case$  % then convert with the lower-case option
+}
+
+INTEGERS { len }  % prefix length used by chop.word
+
+FUNCTION {chop.word}  % expects a prefix, its length and a string; pushes the string with that prefix removed when it starts with it
+{ 's :=  % the string to examine
+  'len :=  % the length of the prefix
+  s #1 len substring$ =  % compare the first len characters of s with the prefix left on the stack
+    { s len #1 + global.max$ substring$ }  % match: keep everything after the prefix
+    's  % no match: the string is returned unchanged
+  if$
+}
+
+FUNCTION {sort.format.names}  % build the sort form of the name list on the stack: sortified names separated by three spaces
+{ 's :=
+  #1 'nameptr :=
+  ""  % result accumulator
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+ { "   " * }  % separator before every name but the first
+ 'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=  % von part, last name, first-name initials, junior part
+      nameptr numnames = t "others" = and
+ { "et al" * }  % a final name of others sorts as et al
+ { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}  % build the sort form of the title on the stack: leading article removed, then sortified
+{ 't :=
+  "A " #2  % arguments for the third chop.word call
+    "An " #3  % arguments for the second chop.word call
+      "The " #4 t chop.word  % first call: strip a leading The
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$  % limit the result to global.max$ characters
+}
+
+FUNCTION {author.sort}  % push the name part of the sort key: authors, else the key field, else the null string with a warning
+{ author empty$
+    { key empty$
+ { "to sort, need author or key in " cite$ * warning$
+   ""
+ }
+ { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}  % push the name part of the sort key: authors, else editors, else the key field, else the null string with a warning
+{ author empty$
+    { editor empty$
+ { key empty$
+     { "to sort, need author, editor, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}  % push the name part of the sort key: authors, else organization, else the key field, else the null string with a warning
+{ author empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need author, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }  % a leading The is dropped from the organization name
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}  % push the name part of the sort key: editors, else organization, else the key field, else the null string with a warning
+{ editor empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need editor, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }  % a leading The is dropped from the organization name
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION {presort}  % compute sort.key$ for the current entry: name part, year and title, separated by four spaces
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort  % book and inbook may sort by editor
+    { type$ "proceedings" =
+ 'editor.organization.sort
+ { type$ "manual" =
+     'author.organization.sort
+     'author.sort  % every other entry type
+   if$
+ }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify  % the year is the second sort criterion
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title  % the title is the third sort criterion
+  *
+  #1 entry.max$ substring$  % limit the key to entry.max$ characters
+  'sort.key$ :=
+}
+
+ITERATE {presort}  % compute the key of every entry ...
+
+SORT  % ... then sort the entries by those keys
+
+STRINGS { longest.label }  % widest label seen so far
+
+INTEGERS { number.label longest.label.width }  % next numeric label, and the width of longest.label
+
+FUNCTION {initialize.longest.label}  % reset the label bookkeeping before the labelling pass
+{ "" 'longest.label :=
+  #1 'number.label :=  % labels start at 1
+  #0 'longest.label.width :=
+}
+
+FUNCTION {longest.label.pass}  % give the current entry the next numeric label and remember the widest label
+{ number.label int.to.str$ 'label :=
+  number.label #1 + 'number.label :=
+  label width$ longest.label.width >  % width$ is the built-in width measure, not the character count
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {longest.label.pass}  % runs once per entry, in sorted order
+
+FUNCTION {begin.bib}  % write the database preamble, if any, and open the thebibliography environment
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  *   % the widest label is passed as the environment argument
+  "}\setlength{\itemsep}{-1ex}\small" * write$ newline$  % the same line also sets a negative item separation and switches to \small
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}  % the state constants must be set before any entry is formatted
+
+ITERATE {call.type$}  % format every entry with the function named after its type
+
+FUNCTION {end.bib}  % close the thebibliography environment
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
+
+% end of file ieee.bst
+% ---------------------------------------------------------------
diff --git a/lustre/portals/doc/mpi.fig b/lustre/portals/doc/mpi.fig
new file mode 100644 (file)
index 0000000..e1a91b5
--- /dev/null
@@ -0,0 +1,117 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 150 1650 900 2025
+4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001
+4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001
+-6
+6 150 150 900 525
+4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001
+4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001
+-6
+6 2550 4125 3150 4725
+4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001
+4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001
+4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001
+-6
+6 1050 1575 1950 1875
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 1575 1950 1575 1950 1875 1050 1875 1050 1575
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001
+-6
+6 5400 1575 6300 2175
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 1575 6300 1575 6300 2175 5400 2175 5400 1575
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001
+-6
+6 5400 2400 6300 3000
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 2400 6300 2400 6300 3000 5400 3000 5400 2400
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001
+-6
+6 1050 2400 1950 2700
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 2400 1950 2400 1950 2700 1050 2700 1050 2400
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001
+-6
+6 1050 825 1950 1125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 825 1950 825 1950 1125 1050 1125 1050 825
+4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1575
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2025 4050 3375
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 675 6600 675
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 1350 6600 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 4125 3300 4125 3300 4725 2400 4725 2400 4125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 4500 4050 3675
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 1725 5400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2550 5400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2850 4050 3450
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1800 1500 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 825 3300 825 3300 1275 2400 1275 2400 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 2625 1500 4125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 4125 1950 4125 1950 4425 1050 4425 1050 4125
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 300 1500 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 975 2400 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 1725 2400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 2550 2400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 4275 2400 4275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 1575 3300 1575 3300 2175 2400 2175 2400 1575
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 2400 3300 2400 3300 3000 2400 3000 2400 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4050 3300 5250 3300 5250 3750 4050 3750 4050 3300
+4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001
+4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001
+4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001
+4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001
+4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001
+4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001
+4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001
diff --git a/lustre/portals/doc/portals.fig b/lustre/portals/doc/portals.fig
new file mode 100644 (file)
index 0000000..9b1271b
--- /dev/null
@@ -0,0 +1,68 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 900 1650 900 1650 1200 1350 1200 1350 900
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1800 1350 2100 1350 2100 1650 1800 1650 1800 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2250 1800 2550 1800 2550 2100 2250 2100 2250 1800
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        4200 375 4200 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        525 600 1125 600 1125 2100 525 2100 525 600
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4425 1275 4875 1275 4875 1950 4425 1950 4425 1275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 1200 3150 1200 3150 1500 2550 1500 2550 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 1425 4425 1425
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3600 825 3750 825 3750 1125 3600 1125 3600 825
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2025 1425 2550 1425
+2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        4425 750 4875 750 4875 1125 4425 1125 4425 750
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3675 975 4425 975
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2
+       0 0 1.00 60.00 120.00
+        825 1050 1350 1050
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1350 1500 1500 1650 1500 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1950 1575 1950 1800 1950 1950 2100 1950 2250 1950
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 975 1125 975
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 1125 1125 1125
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7
+       0 0 1.00 60.00 120.00
+        3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975
+        3600 975
+        0.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001
+4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001
+4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001
+4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001
+4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001
+4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001
+4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001
diff --git a/lustre/portals/doc/portals3.bib b/lustre/portals/doc/portals3.bib
new file mode 100644 (file)
index 0000000..323b99f
--- /dev/null
@@ -0,0 +1,124 @@
+@Article{           Cplant,
+    title       = { {M}assively {P}arallel {C}omputing with
+                    {C}ommodity {C}omponents },
+    author      = { Ron Brightwell and David S. Greenberg and Arthur
+                    B. Maccabe and Rolf Riesen },
+    journal     = { Parallel Computing },
+    volume      = { 26 },
+    month       = { February },
+    pages       = { 243-266 },
+    year        = { 2000 }
+}
+
+@Manual{     Portals,
+    organization = { Sandia National Laboratories },
+    title        = { {P}uma {P}ortals },
+    note         = { http://www.cs.sandia.gov/puma/portals },
+    year         = { 1997 }
+}
+
+@Techreport{      VIA,
+  title         = { {V}irtual {I}nterface {A}rchitecture
+                    {S}pecification {V}ersion 1.0 }, 
+  author        = { {Compaq, Microsoft, and Intel} },
+  institution   = { Compaq, Microsoft, and Intel },
+  month         = { December },
+  year          = { 1997 }
+}
+
+@Techreport{      ST,
+  title         = { {I}nformation {T}echnology - {S}cheduled
+                  {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 },
+  author        = { {Task Group of Technical Committee T11} },
+  institution   = { Accredited Standards Committee NCITS },
+  month         = { July },
+  year          = { 1998 }
+}
+
+@Manual{     TFLOPS,
+    organization = { Sandia National Laboratories },
+    title        = { ASCI Red },
+    note         = { http://www.sandia.gov/ASCI/TFLOP },
+    year         = { 1996 }
+}
+
+@Techreport{      GM,
+  title         = { The {GM} {M}essage {P}assing {S}ystem },
+  author         = { {Myricom, Inc.} },
+  institution    = { {Myricom, Inc.} },
+  year          = { 1997 },
+}
+
+@Article{           MPIstandard,
+    title        = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard },
+    author       = { {Message Passing Interface Forum} },
+    journal      = { The International Journal of Supercomputer Applications
+                     and High Performance Computing },
+    volume       = { 8 },
+    year         = { 1994 }
+}
+
+@Inproceedings{    PumaOS,
+    author       = "Lance Shuler and Chu Jong and Rolf Riesen and
+                    David van Dresser and Arthur B. Maccabe and
+                    Lee Ann Fisk and T. Mack Stallcup",
+    booktitle    = "Proceedings of the 1995 Intel Supercomputer
+                    User's Group Conference",
+    title        = "The {P}uma Operating System for Massively Parallel Computers",
+    organization = "Intel Supercomputer User's Group",
+    year         = 1995
+}
+
+@InProceedings{   SUNMOS,
+author          = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and
+                   Stephen R. Wheat",
+title           = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide",
+booktitle       = "Proceedings of the {Intel} Supercomputer Users' Group. 1994
+                   Annual North America Users' Conference.",
+year            = 1994,
+pages           = "245--251",
+month           = "June",
+location        = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps"
+}
+
+@InProceedings {   PumaMPI,
+    title        = { Design and Implementation of {MPI} on {P}uma Portals },
+    author       = { Ron Brightwell and Lance Shuler },
+    booktitle    = { Proceedings of the Second MPI Developer's Conference },
+    pages        = { 18-25 },
+    month        = { July },
+    year         = { 1996 }
+}
+
+@Inproceedings{     FM2,
+    author       = { Mario Lauria and Scott Pakin and Andrew Chien },
+    title        = { {E}fficient {L}ayering for {H}igh {S}peed
+                     {C}ommunication: {F}ast {M}essages 2.x },
+    Booktitle    = { Proceedings of the IEEE International Symposium
+                     on High Performance Distributed Computing },
+    year         = { 1998 }
+}
+
+@Manual {          CraySHMEM,
+    title        = "SHMEM Technical Note for C, SG-2516 2.3",
+    organization = "Cray Research, Inc.",
+    month        = "October",
+    year         = 1994
+}
+
+@Manual {          MPI2,
+    title        = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface",
+    organization = "Message Passing Interface Forum",
+    note         = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html",
+    month        = "July",
+    year         = 1997
+}
+
+@InProceedings {   PMMPI,
+    title        = { {The Design and Implementation of Zero Copy MPI Using
+                       Commodity Hardware with a High Performance Network} },
+    author       = { Francis O'Carroll and  Hiroshi Tezuka and Atsushi Hori
+                     and Yutaka Ishikawa  },
+    booktitle    = { Proceedings of the ICS },
+    year         = { 1998 }
+}
diff --git a/lustre/portals/doc/portals3.lyx b/lustre/portals/doc/portals3.lyx
new file mode 100644 (file)
index 0000000..f3c24e0
--- /dev/null
@@ -0,0 +1,15946 @@
+#LyX 1.2 created this file. For more info see http://www.lyx.org/
+\lyxformat 220
+\textclass report
+\begin_preamble
+\usepackage{fullpage}
+\renewenvironment{comment}%
+{\begin{quote}\textbf{Discussion}: \slshape}%
+{\end{quote}}
+\pagestyle{myheadings}
+\markboth{$Revision: 1.1.2.1 $\hfil$Date: 2003/05/19 04:25:30 $}%
+{$Date: 2003/05/19 04:25:30 $\hfil$Revision: 1.1.2.1 $}
+\end_preamble
+\language american
+\inputencoding auto
+\fontscheme pslatex
+\graphics default
+\paperfontsize 10
+\spacing single 
+\papersize letterpaper
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 2
+\tocdepth 2
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 2
+\paperpagestyle headings
+
+\layout Title
+
+The Portals 3.2 Message Passing Interface 
+\newline 
+ Revision 1.1
+\layout Author
+
+Ron Brightwell
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+R.
+ Brightwell and R.
+ Riesen are with the Scalable Computing Systems Department, Sandia National
+ Laboratories, P.O.
+ Box 5800, Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov.
+\end_inset 
+
+, Arthur B.
+ Maccabe
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+A.
+ B.
+ Maccabe is with the Computer Science Department, University of New Mexico,
+ Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87131-1386, maccabe@cs.unm.edu.
+\end_inset 
+
+, Rolf Riesen and Trammell Hudson
+\layout Abstract
+
+This report presents a specification for the Portals 3.2 message passing
+ interface.
+ Portals 3.2 is intended to allow scalable, high-performance network communicatio
+n between nodes of a parallel computing system.
+ Specifically, it is designed to support a parallel computing platform composed
+ of clusters of commodity workstations connected by a commodity system area
+ network fabric.
+ In addition, Portals 3.2 is well suited to massively parallel processing
+ and embedded systems.
+ Portals 3.2 represents an adaption of the data movement layer developed
+ for massively parallel processing platforms, such as the 4500-node Intel
+ TeraFLOPS machine.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+clearpage
+\backslash 
+pagenumbering{roman}
+\backslash 
+setcounter{page}{3}
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset LatexCommand \tableofcontents{}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList figure
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList table
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Chapter*
+
+Summary of Changes for Revision 1.1
+\layout Enumerate
+
+Updated version number to 3.2 throughout the document
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sub:PtlGetId}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_SEGV
+\family default 
+ to error list for 
+\shape italic 
+PtlGetId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_ML_TOOLONG
+\family default 
+ to error list for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meunlink}
+
+\end_inset 
+
+: removed text referring to a list of associated memory descriptors.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added text to describe unlinking a free-floating memory descriptor.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added entry for 
+\family typewriter 
+ptl_seq_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+:
+\begin_deeper 
+\layout Enumerate
+
+added definition of 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+added text to clarify 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: modified text for 
+\family typewriter 
+unlink_op
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: added text to clarify multiple calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: added text to clarify 
+\family typewriter 
+unlink_nofit
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:receiving}
+
+\end_inset 
+
+: removed text indicating that an MD will reject a message if the associated
+ EQ is full.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ error code and text to indicate that only MDs with no pending operations
+ can be unlinked.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ return code.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added user id field, MD handle field, and NI specific failure field to
+ the 
+\family typewriter 
+ptl_event_t
+\family default 
+ structure.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_EVENT_UNLINK
+\family default 
+ event type.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: removed 
+\shape slanted 
+PtlTransId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+: listed allowable constants with relevant fields.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: added 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ function.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_PT_FULL
+\family default 
+ return code for 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+: updated to reflect new event types.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_nid_t
+\family default 
+, 
+\family typewriter 
+ptl_pid_t
+\family default 
+, and 
+\family typewriter 
+ptl_uid_t
+\family default 
+.
+\layout Chapter*
+
+Summary of Changes for Version 3.1
+\layout Section*
+
+Thread Issues
+\layout Standard
+
+The most significant change to the interface from version 3.0 to 3.1 involves
+ the clarification of how the interface interacts with multi-threaded applicatio
+ns.
+ We adopted a generic thread model in which processes define an address
+ space and threads share the address space.
+ Consideration of the API in the light of threads led to several clarifications
+ throughout the document: 
+\layout Enumerate
+
+Glossary: 
+\begin_deeper 
+\layout Enumerate
+
+added a definition for 
+\emph on 
+thread
+\emph default 
+, 
+\layout Enumerate
+
+reworded the definition for 
+\emph on 
+process
+\emph default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+: added section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:threads}
+
+\end_inset 
+
+ to describe the multi-threading model used by the Portals API.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlfini}
+
+\end_inset 
+
+: 
+\emph on 
+PtlFini
+\emph default 
+ should be called once as the process is terminating and not as each thread
+ terminates.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+: Portals does not define thread ids.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+: network interfaces are associated with processes, not threads.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlNIInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqget}
+
+\end_inset 
+
+: 
+\emph on 
+PtlEQGet
+\emph default 
+ returns 
+\family typewriter 
+PTL_EQ_EMPTY
+\family default 
+ if a thread is blocked on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqwait}
+
+\end_inset 
+
+: waiting threads are awakened in FIFO order.
+\layout Standard
+
+Two functions, 
+\emph on 
+PtlNIBarrier
+\emph default 
+ and 
+\emph on 
+PtlEQCount
+\emph default 
+ were removed from the API.
+\emph on 
+PtlNIBarrier
+\emph default 
+ was defined to block the calling process until all of the processes in
+ the application group had invoked 
+\emph on 
+PtlNIBarrier
+\emph default 
+.
+ We now consider this functionality, along with the concept of groups (see
+ the discussion under 
+\begin_inset Quotes eld
+\end_inset 
+
+other changes
+\begin_inset Quotes erd
+\end_inset 
+
+), to be part of the runtime system, not part of the Portals API.
+\emph on 
+PtlEQCount
+\emph default 
+ was defined to return the number of events in an event queue.
+ Because external operations may lead to new events being added and other
+ threads may remove events, the value returned by 
+\emph on 
+PtlEQCount
+\emph default 
+ would have to be a hint about the number of events in the event queue.
+\layout Section*
+
+Handling small, unexpected messages
+\layout Standard
+
+Another set of changes relates to handling small unexpected messages in
+ MPI.
+ In designing version 3.0, we assumed that each unexpected message would
+ be placed in a unique memory descriptor.
+ To avoid the need to process a long list of memory descriptors, we moved
+ the memory descriptors out of the match list and hung them off of a single
+ match list entry.
+ In this way, large unexpected messages would only encounter a single 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry before encountering the 
+\begin_inset Quotes eld
+\end_inset 
+
+long message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry.
+ Experience with this strategy identified resource management problems with
+ this approach.
+ In particular, a long sequence of very short (or zero length) messages
+ could quickly exhaust the memory descriptors constructed for handling unexpecte
+d messages.
+ Our new strategy involves the use of several very large memory descriptors
+ for small unexpected messages.
+ Consecutive unexpected messages will be written into the first of these
+ memory descriptors until the memory descriptor fills up.
+ When the first of the 
+\begin_inset Quotes eld
+\end_inset 
+
+small memory
+\begin_inset Quotes erd
+\end_inset 
+
+ descriptors fills up, it will be unlinked and subsequent short messages
+ will be written into the next 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor.
+ In this case, a 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor will be declared full when it does not have sufficient
+ space for the largest small unexpected message.
+\layout Standard
+
+This led to two significant changes.
+ First, each match list entry now has a single memory descriptor rather
+ than a list of memory descriptors.
+ Second, in addition to exceeding the operation threshold, a memory descriptor
+ can be unlinked when the local offset exceeds a specified value.
+ These changes have led to several changes in this document: 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{subsec:paddress}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed references to the memory descriptor list, 
+\layout Enumerate
+
+changed the portals address translation description to indicate that unlinking
+ a memory descriptor implies unlinking the associated match list entry--match
+ list entries can no longer be unlinked independently from the memory descriptor.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed unlink from argument list, 
+\layout Enumerate
+
+removed description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+changed wording of the error condition when the Portal table index already
+ has an associated match list.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+: removed unlink from argument list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+added description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+removed reference to memory descriptor lists, 
+\layout Enumerate
+
+changed wording of the error condition when match list entry already has
+ an associated memory descriptor, 
+\layout Enumerate
+
+changed the description of the 
+\family typewriter 
+unlink
+\family default 
+ argument.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+: removed 
+\family typewriter 
+PtlMDInsert
+\family default 
+ operation.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: removed references to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: removed references to PtlMDInsert.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+: revised the MPI example to reflect the changes to the interface.
+\layout Standard
+
+Several changes have been made to improve the general documentation of the
+ interface.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_ID_ANY
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: documented the return value 
+\family typewriter 
+PTL_INV_EQ
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+: clarified the description of the 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:implvals}
+
+\end_inset 
+
+: introduced a new section to document the implementation defined values.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: modified Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ to indicate where each constant is introduced and where it is used.
+\layout Section*
+
+Other changes
+\layout Subsection*
+
+Implementation defined limits (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version provided implementation defined limits for the maximum
+ number of match entries, the maximum number of memory descriptors, etc.
+ Rather than spanning the entire implementation, these limits are now associated
+ with individual network interfaces.
+\layout Subsection*
+
+Added User Ids (Section 
+\begin_inset LatexCommand \ref{sec:uid}
+
+\end_inset 
+
+)
+\layout Standard
+
+Group Ids had been used to simplify access control entries.
+ In particular, a process could allow access for all of the processes in
+ a group.
+ User Ids have been introduced to regain this functionality.
+ We use user ids to fill this role.
+\layout Subsection*
+
+Removed Group Ids and Rank Ids (Section 
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version of Portals had two forms for addressing processes: <node
+ id, process id> and <group id, rank id>.
+ A process group was defined as the collection of processes created during
+ application launch.
+ Each process in the group was given a unique rank id in the range 0 to
+\begin_inset Formula $n-1$
+\end_inset 
+
+ where 
+\begin_inset Formula $n$
+\end_inset 
+
+ was the number of processes in the group.
+ We removed groups because they are better handled in the runtime system.
+\layout Subsection*
+
+Match lists (Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+)
+\layout Standard
+
+It is no longer illegal to have an existing match entry when calling PtlMEAttach.
+ A position argument was added to the list of arguments supplied to 
+\emph on 
+PtlMEAttach
+\emph default 
+ to specify whether the new match entry is prepended or appended to the
+ existing list.
+ If there is no existing match list, the position argument is ignored.
+\layout Subsection*
+
+Unlinking Memory Descriptors (Section 
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, a memory descriptor could be unlinked if the offset exceeded
+ a threshold upon the completion of an operation.
+ In this version, the unlinking is delayed until there is a matching operation
+ which requires more memory than is currently available in the descriptor.
+ In addition to changes in that section, this led to a revision of Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+.
+\layout Subsection*
+
+Split Phase Operations and Events (Section 
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, there were five types of events: 
+\family typewriter 
+PTL_EVENT_PUT
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+, and 
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+.
+ The first four of these reflected the completion of potentially long operations.
+ We have introduced new event types to reflect the fact that long operations
+ have a distinct starting point and a distinct completion point.
+ Moreover, the completion may be successful or unsuccessful.
+\layout Standard
+
+In addition to providing a mechanism for reporting failure to higher levels
+ of software, this split provides an opportunity for improved ordering
+ semantics.
+ Previously, if one process initiated two operations (e.g., two put operations)
+ on a remote process, these operations were guaranteed to complete in the
+ same order that they were initiated.
+ Now, we only guarantee that the initiation events are delivered in the
+ same order.
+ In particular, the operations do not need to complete in the order that
+ they were initiated.
+\layout Subsection*
+
+Well known process ids (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+To support the notion of 
+\begin_inset Quotes eld
+\end_inset 
+
+well known process ids,
+\begin_inset Quotes erd
+\end_inset 
+
+ we added a process id argument to the arguments for PtlNIInit.
+\layout Chapter*
+
+Glossary
+\layout Description
+
+API Application Programming Interface.
+ A definition of the functions and semantics provided by a library of functions.
+\layout Description
+
+Initiator A 
+\emph on 
+process
+\emph default 
+ that initiates a message operation.
+\layout Description
+
+Message An application-defined unit of data that is exchanged between 
+\emph on 
+processes
+\emph default 
+.
+\layout Description
+
+Message\SpecialChar ~
+Operation Either a put operation, which writes data, or a get operation,
+ which reads data.
+\layout Description
+
+Network A network provides point-to-point communication between 
+\emph on 
+nodes
+\emph default 
+.
+ Internally, a network may provide multiple routes between endpoints (to
+ improve fault tolerance or to improve performance characteristics); however,
+ multiple paths will not be exposed outside of the network.
+\layout Description
+
+Node A node is an endpoint in a 
+\emph on 
+network
+\emph default 
+.
+ Nodes provide processing capabilities and memory.
+ A node may provide multiple processors (an SMP node) or it may act as a
+\emph on 
+gateway
+\emph default 
+ between networks.
+\layout Description
+
+Process A context of execution.
+ A process defines a virtual memory (VM) context.
+ This context is not shared with other processes.
+ Several threads may share the VM context defined by a process.
+\layout Description
+
+Target A 
+\emph on 
+process
+\emph default 
+ that is acted upon by a message operation.
+\layout Description
+
+Thread A context of execution that shares a VM context with other threads.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\layout Standard
+
+\backslash 
+setcounter{page}{1}
+\backslash 
+pagenumbering{arabic}
+\end_inset 
+
+
+\layout Chapter
+
+Introduction
+\begin_inset LatexCommand \label{sec:intro}
+
+\end_inset 
+
+
+\layout Section
+
+Overview
+\layout Standard
+
+This document describes an application programming interface for message
+ passing between nodes in a system area network.
+ The goal of this interface is to improve the scalability and performance
+ of network communication by defining the functions and semantics of message
+ passing required for scaling a parallel computing system to ten thousand
+ nodes.
+ This goal is achieved by providing an interface that will allow a quality
+ implementation to take advantage of the inherently scalable design of Portals.
+\layout Standard
+
+This document is divided into several sections: 
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:intro}
+
+\end_inset 
+
+---Introduction This section describes the purpose and scope of the Portals
+ API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+---An\SpecialChar ~
+Overview\SpecialChar ~
+of\SpecialChar ~
+the\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section gives a brief overview of the
+ Portals API.
+ The goal is to introduce the key concepts and terminology used in the descripti
+on of the API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:api}
+
+\end_inset 
+
+---The\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section describes the functions and semantics of
+ the Portals application programming interface.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+---The\SpecialChar ~
+Semantics\SpecialChar ~
+of\SpecialChar ~
+Message\SpecialChar ~
+Transmission This section describes the semantics
+ of message transmission.
+ In particular, the information transmitted in each type of message and
+ the processing of incoming messages.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:examples}
+
+\end_inset 
+
+---Examples This section presents several examples intended to illustrate
+ the use of the Portals API.
+\layout Section
+
+Purpose
+\layout Standard
+
+Existing message passing technologies available for commodity cluster networking
+ hardware do not meet the scalability goals required by the Cplant\SpecialChar ~
+
+\begin_inset LatexCommand \cite{Cplant}
+
+\end_inset 
+
+ project at Sandia National Laboratories.
+ The goal of the Cplant project is to construct a commodity cluster that
+ can scale to the order of ten thousand nodes.
+ This number greatly exceeds the capacity for which existing message passing
+ technologies have been designed and implemented.
+\layout Standard
+
+In addition to the scalability requirements of the network, these technologies
+ must also be able to support a scalable implementation of the Message Passing
+ Interface (MPI)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPIstandard}
+
+\end_inset 
+
+ standard, which has become the 
+\shape italic 
+de facto
+\shape default 
+ standard for parallel scientific computing.
+ While MPI does not impose any scalability limitations, existing message
+ passing technologies do not provide the functionality needed to allow implement
+ations of MPI to meet the scalability requirements of Cplant.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ any inherent scalability limitations: 
+\layout Itemize
+
+Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+ and TCP/IP sockets, have limitations on the number of peer connections
+ that can be established.
+\layout Itemize
+
+Network independence - Many communication systems depend on the host processor
+ to perform operations in order for messages in the network to be consumed.
+ Message consumption from the network should not be dependent on host processor
+ activity, such as the operating system scheduler or user-level thread scheduler.
+\layout Itemize
+
+User-level flow control - Many communication systems manage flow control
+ internally to avoid depleting resources, which can significantly impact
+ performance as the number of communicating processes increases.
+\layout Itemize
+
+OS Bypass - High performance network communication should not involve memory
+ copies into or out of a kernel-managed protocol stack.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ scalability limitations for an implementation of MPI:
+\layout Itemize
+
+Receiver-managed - Sender-managed message passing implementations require
+ a persistent block of memory to be available for every process, requiring
+ memory resources to increase with job size and requiring user-level flow
+ control mechanisms to manage these resources.
+\layout Itemize
+
+User-level Bypass - While OS Bypass is necessary for high-performance, it
+ alone is not sufficient to support the Progress Rule of MPI asynchronous
+ operations.
+\layout Itemize
+
+Unexpected messages - Few communication systems have support for receiving
+ messages for which there is no prior notification.
+ Support for these types of messages is necessary to avoid flow control
+ and protocol overhead.
+\layout Section
+
+Background
+\layout Standard
+
+Portals was originally designed for and implemented on the nCube machine
+ as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{SUNMOS}
+
+\end_inset 
+
+ and Puma\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaOS}
+
+\end_inset 
+
+ lightweight kernel development projects.
+ Portals went through two design phases, the latter of which is used on
+ the 4500-node Intel TeraFLOPS machine\SpecialChar ~
+
+\begin_inset LatexCommand \cite{TFLOPS}
+
+\end_inset 
+
+.
+ Portals have been very successful in meeting the needs of such a large
+ machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaMPI}
+
+\end_inset 
+
+, but also for implementing the scalable run-time environment and parallel
+ I/O capabilities of the machine.
+\layout Standard
+
+The second generation Portals implementation was designed to take full advantage
+ of the hardware architecture of large MPP machines.
+ However, efforts to implement this same design on commodity cluster technology
+ identified several limitations, due to the differences in network hardware
+ as well as to shortcomings in the design of Portals.
+\layout Section
+
+Scalability
+\layout Standard
+
+The primary goal in the design of Portals is scalability.
+ Portals are designed specifically for an implementation capable of supporting
+ a parallel job running on tens of thousands of nodes.
+ Performance is critical only in terms of scalability.
+ That is, the level of message passing performance is characterized by how
+ far it allows an application to scale and not by how it performs in micro-bench
+marks (e.g., a two node bandwidth or latency test).
+\layout Standard
+
+The Portals API is designed to allow for scalability, not to guarantee it.
+ Portals cannot overcome the shortcomings of a poorly designed application
+ program.
+ Applications that have inherent scalability limitations, either through
+ design or implementation, will not be transformed by Portals into scalable
+ applications.
+ Scalability must be addressed at all levels.
+ Portals do not inhibit scalability, but do not guarantee it either.
+\layout Standard
+
+To support scalability, the Portals interface maintains a minimal amount
+ of state.
+ Portals provide reliable, ordered delivery of messages between pairs of
+ processes.
+ They are connectionless: a process is not required to explicitly establish
+ a point-to-point connection with another process in order to communicate.
+ Moreover, all buffers used in the transmission of messages are maintained
+ in user space.
+ The target process determines how to respond to incoming messages, and
+ messages for which there are no buffers are discarded.
+\layout Section
+
+Communication Model
+\layout Standard
+
+Portals combine the characteristics of both one-sided and two-sided communication.
+ They define a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching put
+\begin_inset Quotes erd
+\end_inset 
+
+ operation and a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching get
+\begin_inset Quotes erd
+\end_inset 
+
+ operation.
+ The destination of a put (or send) is not an explicit address; instead,
+ each message contains a set of match bits that allow the receiver to determine
+ where incoming messages should be placed.
+ This flexibility allows Portals to support both traditional one-sided operation
+s and two-sided send/receive operations.
+\layout Standard
+
+Portals allow the target to determine whether incoming messages are acceptable.
+ A target process can choose to accept message operations from any specific
+ process or can choose to ignore message operations from any specific process.
+\layout Section
+
+Zero Copy, OS Bypass and Application Bypass
+\layout Standard
+
+In traditional system architectures, network packets arrive at the network
+ interface card (NIC), are passed through one or more protocol layers in
+ the operating system, and eventually copied into the address space of the
+ application.
+ As network bandwidth began to approach memory copy rates, reduction of
+ memory copies became a critical concern.
+ This concern led to the development of zero-copy message passing protocols
+ in which message copies are eliminated or pipelined to avoid the loss of
+ bandwidth.
+\layout Standard
+
+A typical zero-copy protocol has the NIC generate an interrupt for the CPU
+ when a message arrives from the network.
+ The interrupt handler then controls the transfer of the incoming message
+ into the address space of the appropriate application.
+ The interrupt latency, the time from the initiation of an interrupt until
+ the interrupt handler is running, is fairly significant.
+ To avoid this cost, some modern NICs have processors that can be programmed
+ to implement part of a message passing protocol.
+ Given a properly designed protocol, it is possible to program the NIC to
+ control the transfer of incoming messages, without needing to interrupt
+ the CPU.
+ Because this strategy does not need to involve the OS on every message
+ transfer, it is frequently called 
+\begin_inset Quotes eld
+\end_inset 
+
+OS Bypass.
+\begin_inset Quotes erd
+\end_inset 
+
+ ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+, FM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{FM2}
+
+\end_inset 
+
+, GM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{GM}
+
+\end_inset 
+
+, and Portals are examples of OS Bypass protocols.
+\layout Standard
+
+Many protocols that support OS Bypass still require that the application
+ actively participate in the protocol to ensure progress.
+ As an example, the long message protocol of PM requires that the application
+ receive and reply to a request to put or get a long message.
+ This complicates the runtime environment, requiring a thread to process
+ incoming requests, and significantly increases the latency required to
+ initiate a long message protocol.
+ The Portals message passing protocol does not require activity on the part
+ of the application to ensure progress.
+ We use the term 
+\begin_inset Quotes eld
+\end_inset 
+
+Application Bypass
+\begin_inset Quotes erd
+\end_inset 
+
+ to refer to this aspect of the Portals protocol.
+\layout Section
+
+Faults 
+\layout Standard
+
+Given the number of components that we are dealing with and the fact that
+ we are interested in supporting applications that run for very long times,
+ failures are inevitable.
+ The Portals API recognizes that the underlying transport may not be able
+ to successfully complete an operation once it has been initiated.
+ This is reflected in the fact that the Portals API reports three types
+ of events: events indicating the initiation of an operation, events indicating
+ the successful completion of an operation, and events indicating the unsuccessf
+ul completion of an operation.
+ Every initiation event is eventually followed by a successful completion
+ event or an unsuccessful completion event.
+\layout Standard
+
+Between the time an operation is started and the time that the operation
+ completes (successfully or unsuccessfully), any memory associated with
+ the operation should be considered volatile.
+ That is, the memory may be changed in unpredictable ways while the operation
+ is progressing.
+ Once the operation completes, the memory associated with the operation
+ will not be subject to further modification (from this operation).
+ Notice that unsuccessful operations may alter memory in an essentially
+ unpredictable fashion.
+\layout Chapter
+
+An Overview of the Portals API
+\begin_inset LatexCommand \label{sec:apiover}
+
+\end_inset 
+
+
+\layout Standard
+
+In this section, we give a conceptual overview of the Portals API.
+ The goal is to provide a context for understanding the detailed description
+ of the API presented in the next section.
+\layout Section
+
+Data Movement
+\begin_inset LatexCommand \label{sec:dmsemantics}
+
+\end_inset 
+
+
+\layout Standard
+
+A Portal represents an opening in the address space of a process.
+ Other processes can use a Portal to read (get) or write (put) the memory
+ associated with the portal.
+ Every data movement operation involves two processes, the 
+\series bold 
+initiator
+\series default 
+ and the 
+\series bold 
+target
+\series default 
+.
+ The initiator is the process that initiates the data movement operation.
+ The target is the process that responds to the operation by either accepting
+ the data for a put operation, or replying with the data for a get operation.
+\layout Standard
+
+In this discussion, activities attributed to a process may refer to activities
+ that are actually performed by the process or 
+\emph on 
+on behalf of the process
+\emph default 
+.
+ The inclusiveness of our terminology is important in the context of 
+\emph on 
+application bypass
+\emph default 
+.
+ In particular, when we note that the target sends a reply in the case of
+ a get operation, it is possible that the reply will be generated by another
+ component in the system, bypassing the application.
+\layout Standard
+
+Figures\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:put}
+
+\end_inset 
+
+ and 
+\begin_inset LatexCommand \ref{fig:get}
+
+\end_inset 
+
+ present graphical interpretations of the Portal data movement operations:
+ put and get.
+ In the case of a put operation, the initiator sends a put request message
+ containing the data to the target.
+ The target translates the Portal addressing information in the request
+ using its local Portal structures.
+ When the request has been processed, the target optionally sends an acknowledge
+ment message.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename put.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Put (Send)
+\begin_inset LatexCommand \label{fig:put}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+In the case of a get operation, the initiator sends a get request to the
+ target.
+ As with the put operation, the target translates the Portal addressing
+ information in the request using its local Portal structures.
+ Once it has translated the Portal addressing information, the target sends
+ a reply that includes the requested data.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename get.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Get
+\begin_inset LatexCommand \label{fig:get}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+We should note that Portal address translations are only performed on nodes
+ that respond to operations initiated by other nodes.
+ Acknowledgements and replies to get operations bypass the portals address
+ translation structures.
+\layout Section
+
+Portal Addressing
+\begin_inset LatexCommand \label{subsec:paddress}
+
+\end_inset 
+
+
+\layout Standard
+
+One-sided data movement models (e.g., shmem\SpecialChar ~
+
+\begin_inset LatexCommand \cite{CraySHMEM}
+
+\end_inset 
+
+, ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, MPI-2\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPI2}
+
+\end_inset 
+
+) typically use a triple to address memory on a remote node.
+ This triple consists of a process id, memory buffer id, and offset.
+ The process id identifies the target process, the memory buffer id specifies
+ the region of memory to be used for the operation, and the offset specifies
+ an offset within the memory buffer.
+\layout Standard
+
+In addition to the standard address components (process id, memory buffer
+ id, and offset), a Portal address includes a set of match bits.
+ This addressing model is appropriate for supporting one-sided operations
+ as well as traditional two-sided message passing operations.
+ Specifically, the Portals API provides the flexibility needed for an efficient
+ implementation of MPI-1, which defines two-sided operations with one-sided
+ completion semantics.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:portals}
+
+\end_inset 
+
+ presents a graphical representation of the structures used by a target
+ in the interpretation of a Portal address.
+ The process id is used to route the message to the appropriate node and
+ is not reflected in this diagram.
+ The memory buffer id, called the 
+\series bold 
+portal id
+\series default 
+, is used as an index into the Portal table.
+ Each element of the Portal table identifies a match list.
+ Each element of the match list specifies two bit patterns: a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+don't care
+\begin_inset Quotes erd
+\end_inset 
+
+ bits, and a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+must match
+\begin_inset Quotes erd
+\end_inset 
+
+ bits.
+ In addition to the two sets of match bits, each match list element has
+ at most one memory descriptor.
+ Each memory descriptor identifies a memory region and an optional event
+ queue.
+ The memory region specifies the memory to be used in the operation and
+ the event queue is used to record information about these operations.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename portals.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 305pt
+       lyxheight 106pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Addressing Structures
+\begin_inset LatexCommand \label{fig:portals}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+ illustrates the steps involved in translating a Portal address, starting
+ from the first element in a match list.
+ If the match criteria specified in the match list entry are met and the
+ memory descriptor list accepts the operation
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Memory descriptors can reject operations because a threshold has been exceeded
+ or because the memory region does not have sufficient space, see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+
+, the operation (put or get) is performed using the memory region specified
+ in the memory descriptor.
+ If the memory descriptor specifies that it is to be unlinked when a threshold
+ has been exceeded, the match list entry is removed from the match list
+ and the resources associated with the memory descriptor and match list
+ entry are reclaimed.
+ Finally, if there is an event queue specified in the memory descriptor,
+ the operation is logged in the event queue.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename flow_new.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 447pt
+       lyxheight 282pt
+\end_inset 
+
+
+\layout Caption
+
+Portals Address Translation
+\begin_inset LatexCommand \label{fig:flow}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+If the match criteria specified in the match list entry are not met, or
+ there is no memory descriptor associated with the match list entry, or
+ the memory descriptor associated with the match list entry rejects the
+ operation, the address translation continues with the next match list entry.
+ If the end of the match list has been reached, the address translation
+ is aborted and the incoming request is discarded.
+\layout Section
+
+Access Control
+\layout Standard
+
+A process can control access to its portals using an access control list.
+ Each entry in the access control list specifies a process id and a Portal
+ table index.
+ The access control list is actually an array of entries.
+ Each incoming request includes an index into the access control list (i.e.,
+ a 
+\begin_inset Quotes eld
+\end_inset 
+
+cookie
+\begin_inset Quotes erd
+\end_inset 
+
+ or hint).
+ If the id of the process issuing the request doesn't match the id specified
+ in the access control list entry or the Portal table index specified in
+ the request doesn't match the Portal table index specified in the access
+ control list entry, the request is rejected.
+ Process identifiers and Portal table indexes may include wild card values
+ to increase the flexibility of this mechanism.
+\layout Standard
+
+Two aspects of this design merit further discussion.
+ First, the model assumes that the information in a message header, the
+ sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header
+ is trustworthy; however, using cryptographic techniques, we could easily
+ devise a protocol that would ensure the authenticity of the sender.
+\layout Standard
+
+Second, because the access check is performed by the receiver, it is possible
+ that a malicious process will generate thousands of messages that will
+ be denied by the receiver.
+ This could saturate the network and/or the receiver, resulting in a 
+\emph on 
+denial of service
+\emph default 
+ attack.
+ Moving the check to the sender using capabilities would remove the potential
+ for this form of attack.
+ However, the solution introduces the complexities of capability management
+ (exchange of capabilities, revocation, protections, etc).
+\layout Section
+
+Multi-threaded Applications
+\begin_inset LatexCommand \label{sec:threads}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports a generic view of multi-threaded applications.
+ From the perspective of the Portals API, an application program is defined
+ by a set of processes.
+ Each process defines a unique address space.
+ The Portals API defines access to this address space from other processes
+ (using portals addressing and the data movement operations).
+ A process may have one or more 
+\emph on 
+threads
+\emph default 
+ executing in its address space.
+\layout Standard
+
+With the exception of 
+\emph on 
+PtlEQWait
+\emph default 
+ every function in the Portals API is non-blocking and atomic with respect
+ to both other threads and external operations that result from data movement
+ operations.
+ While individual operations are atomic, sequences of these operations may
+ be interleaved between different threads and with external operations.
+ The Portals API does not provide any mechanisms to control this interleaving.
+ It is expected that these mechanisms will be provided by the API used to
+ create threads.
+\layout Chapter
+
+The Portals API
+\begin_inset LatexCommand \label{sec:api}
+
+\end_inset 
+
+
+\layout Section
+
+Naming Conventions
+\begin_inset LatexCommand \label{sec:conv}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API defines two types of entities: functions and types.
+ Function names always start with 
+\emph on 
+Ptl
+\emph default 
+ and use mixed upper and lower case.
+ When used in the body of this report, function names appear in italic face,
+ e.g., 
+\emph on 
+PtlInit
+\emph default 
+.
+ The functions associated with an object type will have names that start
+ with 
+\emph on 
+Ptl
+\emph default 
+, followed by the two letter object type code shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ As an example, the function 
+\emph on 
+PtlEQAlloc
+\emph default 
+ allocates resources for an event queue.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Object Type Codes
+\begin_inset LatexCommand \label{tab:objcodes}
+
+\end_inset 
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+\backslash 
+medskip
+\newline 
+  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\emph on 
+xx
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+EQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Event Queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Memory Descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ ME 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Match list Entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Network Interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Type names use lower case with underscores to separate words.
+ Each type name starts with 
+\family typewriter 
+ptl
+\family default 
+_ and ends with 
+\family typewriter 
+_t
+\family default 
+.
+ When used in the body of this report, type names appear in a fixed font,
+ e.g., 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+.
+\layout Standard
+
+Names for constants use upper case with underscores to separate words.
+ Each constant name starts with 
+\family typewriter 
+PTL_
+\family default 
+.
+ When used in the body of this report, constant names appear in a fixed font,
+ e.g., 
+\family typewriter 
+PTL_OK
+\family default 
+.
+\layout Section
+
+Base Types
+\layout Standard
+
+The Portals API defines a variety of base types.
+ These types represent a simple renaming of the base types provided by the
+ C programming language.
+ In most cases these new type names have been introduced to improve type
+ safety and to avoid issues arising from differences in representation sizes
+ (e.g., 16-bit or 32-bit integers).
+\layout Subsection
+
+Sizes
+\begin_inset LatexCommand \label{sec:size-t}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_size_t
+\family default 
+ is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection
+
+Handles
+\begin_inset LatexCommand \label{sec:handle-type}
+
+\end_inset 
+
+\layout Standard
+
+Objects maintained by the API are accessed through handles.
+ Handle types have names of the form 
+\family typewriter 
+ptl_handle_
+\emph on 
+xx
+\emph default 
+_t
+\family default 
+, where 
+\emph on 
+xx
+\emph default 
+ is one of the two letter object type codes shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ For example, the type 
+\family typewriter 
+ptl_handle_ni_t
+\family default 
+ is used for network interface handles.
+\layout Standard
+
+Each type of object is given a unique handle type to enhance type checking.
+ The type, 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+, can be used when a generic handle is needed.
+ Every handle value can be converted into a value of type 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+ without loss of information.
+\layout Standard
+
+Handles are not simple values.
+ Every portals object is associated with a specific network interface and
+ an identifier for this interface (along with an object identifier) is part
+ of the handle for the object.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, of type 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+, is used to indicate the absence of an event queue.
+ See sections 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+ for uses of this value.
+\layout Subsection
+
+Indexes
+\begin_inset LatexCommand \label{sec:index-type}
+
+\end_inset 
+
+\layout Standard
+
+The types 
+\family typewriter 
+ptl_pt_index_t
+\family default 
+ and 
+\family typewriter 
+ptl_ac_index_t
+\family default 
+ are integral types used for representing Portal table indexes and access
+ control table indexes, respectively.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+ for limits on values of these types.
+\layout Subsection
+
+Match Bits
+\begin_inset LatexCommand \label{sec:mb-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+ is capable of holding unsigned 64-bit integer values.
+\layout Subsection
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_interface_t
+\family default 
+ is an integral type used for identifying different network interfaces.
+ Users will need to consult the local documentation to determine appropriate
+ values for the interfaces available.
+ The special value 
+\family typewriter 
+PTL_IFACE_DEFAULT
+\family default 
+ identifies the default interface.
+\layout Subsection
+
+Identifiers
+\begin_inset LatexCommand \label{sec:id-type}
+
+\end_inset 
+
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_nid_t
+\family default 
+ is an integral type used for representing node ids
+\family typewriter 
+, ptl_pid_t
+\family default 
+ is an integral type for representing process ids, and 
+\family typewriter 
+ptl_uid_t 
+\family default 
+is an integral type for representing user ids.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ matches any process identifier, PTL_NID_ANY matches any node identifier,
+ and 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ matches any user identifier.
+ See sections 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+ for uses of these values.
+\layout Subsection
+
+Status Registers
+\begin_inset LatexCommand \label{sec:stat-type}
+
+\end_inset 
+
+
+\layout Standard
+
+Each network interface maintains an array of status registers that can be
+ accessed using the 
+\family typewriter 
+PtlNIStatus
+\family default 
+ function (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ The type 
+\family typewriter 
+ptl_sr_index_t
+\family default 
+ defines the types of indexes that can be used to access the status registers.
+ The only index defined for all implementations is 
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+ which identifies the status register that counts the dropped requests for
+ the interface.
+ Other indexes (and registers) may be defined by the implementation.
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_sr_value_t
+\family default 
+ defines the types of values held in status registers.
+ This is a signed integer type.
+ The size is implementation dependent, but must be at least 32 bits.
+\layout Section
+
+Initialization and Cleanup
+\begin_inset LatexCommand \label{sec:init}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API includes a function, 
+\emph on 
+PtlInit
+\emph default 
+, to initialize the library and a function, 
+\emph on 
+PtlFini
+\emph default 
+, to clean up after the application is done using the library.
+\layout Subsection
+
+PtlInit
+\begin_inset LatexCommand \label{sec:ptlinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlInit( int *max_interfaces );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlInit
+\emph default 
+ function initializes the Portals library.
+ PtlInit must be called at least once by a process before any thread makes
+ a Portals function call, but may be safely called more than once.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_FAIL Indicates an error during initialization.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+max_interfaces
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+max_interfaces
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the maximum number of interfaces
+ that can be initialized.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlFini
+\begin_inset LatexCommand \label{sec:ptlfini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+void PtlFini( void );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlFini
+\emph default 
+ function cleans up after the Portals library is no longer needed by a process.
+ After this function is called, calls to any of the functions defined by
+ the Portal API or use of the structures set up by the Portals API will
+ result in undefined behavior.
+ This function should be called once and only once during termination by
+ a process.
+ Typically, this function will be called in the exit sequence of a process.
+ Individual threads should not call PtlFini when they terminate.
+\layout Section
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports the use of multiple network interfaces.
+ However, each interface is treated as an independent entity.
+ Combining interfaces (e.g., 
+\begin_inset Quotes eld
+\end_inset 
+
+bonding
+\begin_inset Quotes erd
+\end_inset 
+
+ to create a higher bandwidth connection) must be implemented by the application
+ or embedded in the underlying network.
+ Interfaces are treated as independent entities to make it easier to cache
+ information on individual network interface cards.
+\layout Standard
+
+Once initialized, each interface provides a Portal table, an access control
+ table, and a collection of status registers.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for a discussion of updating Portal table entries using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ function.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+ for a discussion of the initialization and updating of entries in the access
+ control table.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+ for a discussion of the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function which can be used to determine the value of a status register.
+\layout Standard
+
+Every other type of Portal object (e.g., memory descriptor, event queue, or
+ match list entry) is associated with a specific network interface.
+ The association to a network interface is established when the object is
+ created and is encoded in the handle for the object.
+\layout Standard
+
+Each network interface is initialized and shut down independently.
+ The initialization routine, 
+\emph on 
+PtlNIInit
+\emph default 
+, returns a handle for an interface object which is used in all subsequent
+ Portal operations.
+ The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to shut down an interface and release any resources that
+ are associated with the interface.
+ Network interface handles are associated with processes, not threads.
+ All threads in a process share all of the network interface handles.
+\layout Standard
+
+The Portals API also defines the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function to query the status registers for a network interface, the 
+\emph on 
+PtlNIDist
+\emph default 
+ function to determine the 
+\begin_inset Quotes eld
+\end_inset 
+
+distance
+\begin_inset Quotes erd
+\end_inset 
+
+ to another process, and the 
+\emph on 
+PtlNIHandle
+\emph default 
+ function to determine the network interface that an object is associated
+ with.
+\layout Subsection
+
+PtlNIInit
+\begin_inset LatexCommand \label{sec:niinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    int            max_match_entries;
+\newline 
+    int            max_mem_descriptors;
+\newline 
+    int            max_event_queues;
+\newline 
+    ptl_ac_index_t max_atable_index; 
+\newline 
+    ptl_pt_index_t max_ptable_index;
+\newline 
+} ptl_ni_limits_t;
+\newline 
+
+\newline 
+int PtlNIInit( ptl_interface_t  interface,
+\newline 
+               ptl_pid_t        pid,
+\newline 
+               ptl_ni_limits_t* desired,
+\newline 
+               ptl_ni_limits_t* actual,
+\newline 
+               ptl_handle_ni_t* handle );
+\layout Standard
+
+Values of type 
+\family typewriter 
+ptl_ni_limits_t
+\family default 
+ include the following members:
+\layout Description
+
+max_match_entries Maximum number of match entries that can be allocated
+ at any one time.
+\layout Description
+
+max_mem_descriptors Maximum number of memory descriptors that can be allocated
+ at any one time.
+\layout Description
+
+max_event_queues Maximum number of event queues that can be allocated at
+ any one time.
+\layout Description
+
+max_atable_index Largest access control table index for this interface,
+ valid indexes range from zero to 
+\family typewriter 
+max_atable_index
+\family default 
+, inclusive.
+\layout Description
+
+max_ptable_index Largest Portal table index for this interface, valid indexes
+ range from zero to 
+\family typewriter 
+max_ptable_index
+\family default 
+, inclusive.
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIInit
+\emph default 
+ function is used to initialize the Portals API for a network interface.
+ This function must be called at least once by each process before any other
+ operations that apply to the interface by any process or thread.
+ For subsequent calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+ from within the same process (either by different threads or the same thread),
+ the desired limits will be ignored and the call will return the existing
+ NI handle.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INIT_DUP Indicates a duplicate initialization of 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INIT_INV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to initialize the
+ interface.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+pid
+\family default 
+ is not a valid process id.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+actual 
+\family default 
+or
+\family typewriter 
+ handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the network interface to be initialized.
+  (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+ for a discussion of  values used to identify network interfaces.)
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+pid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the desired process id (for well-known process ids).
+ The value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ may be used to have the process id assigned by the underlying library.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+desired
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If non-NULL, points to a structure that holds the desired limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+actual
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, the location pointed to by actual will hold the actual
+ limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the interface.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The use of desired is implementation dependent.
+ In particular, an implementation may choose to ignore this argument.
+\layout Subsection
+
+PtlNIFini
+\begin_inset LatexCommand \label{sec:nifini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIFini( ptl_handle_ni_t interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to release the resources allocated for a network interface.
+ Once the 
+\emph on 
+PtlNIFini
+\emph default 
+ operation has been started, the results of pending API operations (e.g.,
+ operations initiated by another thread) for this interface are undefined.
+ Similarly, the effects of incoming operations (puts and gets) or return
+ values (acknowledgements and replies) for this interface are undefined.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the interface to shut down.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlNIStatus
+\begin_inset LatexCommand \label{sec:nistatus}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIStatus( ptl_handle_ni_t interface,
+\newline 
+                 ptl_sr_index_t  status_register,
+\newline 
+                 ptl_sr_value_t* status );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIStatus
+\emph default 
+ function returns the value of a status register for the specified interface.
+ (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+ for more information on status register indexes and status register values.)
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_SR_INDX Indicates that 
+\family typewriter 
+status_register
+\family default 
+ is not a valid status register.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+status
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status_register
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An index for the status register to read.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the current value of the status
+ register.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The only status register that must be defined is a drop count register (
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+).
+ Implementations may define additional status registers.
+ Identifiers for the indexes associated with these registers should start
+ with the prefix 
+\family typewriter 
+PTL_SR_
+\family default 
+.
+\layout Subsection
+
+PtlNIDist
+\layout LyX-Code
+
+int PtlNIDist( ptl_handle_ni_t  interface,
+\newline 
+               ptl_process_id_t process,
+\newline 
+               unsigned long*   distance );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIDist
+\emph default 
+ function returns the distance to another process using the specified interface.
+ Distances are only defined relative to an interface.
+ Distance comparisons between different interfaces on the same process may
+ be meaningless.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+process
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+distance
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+process
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An identifier for the process whose distance is being  requested.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+distance
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  distance to the remote
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+This function should return a static measure of distance.
+ Examples include minimum latency, the inverse of available bandwidth, or
+ the number of switches between the two endpoints.
+\layout Subsection
+
+PtlNIHandle
+\layout LyX-Code
+
+int PtlNIHandle( ptl_handle_any_t handle,
+\newline 
+                 ptl_handle_ni_t* interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIHandle
+\emph default 
+ function returns a handle for the network interface with which the object
+ identified by 
+\family typewriter 
+handle
+\family default 
+ is associated.
+ If the object identified by 
+\family typewriter 
+handle
+\family default 
+ is a network interface, this function returns the same value it is passed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_HANDLE Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a valid handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the object.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the network interface
+ associated with 
+\family typewriter 
+handle
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Every handle should encode the network interface and the object id relative
+ to this handle.
+ Both are presumably encoded using integer values.
+\layout Section
+
+User Identification
+\begin_inset LatexCommand \label{sec:uid}
+
+\end_inset 
+
+
+\layout Standard
+
+Every process runs on behalf of a user.
+\layout Subsection
+
+PtlGetUid
+\layout LyX-Code
+
+int PtlGetUid( ptl_handle_ni_t   ni_handle,
+\newline 
+               ptl_uid_t*        uid );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+uid
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+uid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the user id for the calling
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that user identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, a process may have multiple
+ user identifiers.
+\layout Section
+
+Process Identification
+\begin_inset LatexCommand \label{sec:pid}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes that use the Portals API can be identified using a node id and
+ process id.
+ Every node accessible through a network interface has a unique node identifier
+ and every process running on a node has a unique process identifier.
+ As such, any process in the computing system can be identified by its node
+ id and process id.
+\layout Standard
+
+The Portals API defines a type, 
+\family typewriter 
+ptl_process_id_t
+\family default 
+, for representing process ids and a function, 
+\emph on 
+PtlGetId
+\emph default 
+, which can be used to obtain the id of the current process.
+\layout Comment
+
+The Portals API does not include thread identifiers.
+  Messages are delivered to processes (address spaces) not threads (contexts
+ of  execution).
+\layout Subsection
+
+The Process Id Type
+\begin_inset LatexCommand \label{sec:pid-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_nid_t       nid; /* node id */
+\newline 
+    ptl_pid_t       pid; /* process id */
+\newline 
+} ptl_process_id_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ type uses two identifiers to represent a process id: a node id and a process
+ id.
+\layout Subsection
+
+PtlGetId
+\begin_inset LatexCommand \label{sub:PtlGetId}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGetId( ptl_handle_ni_t   ni_handle,
+\newline 
+              ptl_process_id_t* id );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+id
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the id for the calling process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that process identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, it may have multiple
+ node identifiers.
+\layout Section
+
+Match List Entries and Match Lists
+\begin_inset LatexCommand \label{sec:me}
+
+\end_inset 
+
+
+\layout Standard
+
+A match list is a chain of match list entries.
+ Each match list entry includes a memory descriptor and a set of match criteria.
+ The match criteria can be used to reject incoming requests based on process
+ id or the match bits provided in the request.
+ A match list is created using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ or 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ functions, which create a match list consisting of a single match list
+ entry, attach the match list to the specified Portal index, and return
+ a handle for the match list entry.
+ Match entries can be dynamically inserted and removed from a match list
+ using the 
+\emph on 
+PtlMEInsert
+\emph default 
+ and 
+\emph on 
+PtlMEUnlink
+\emph default 
+ functions.
+\layout Subsection
+
+PtlMEAttach
+\begin_inset LatexCommand \label{sec:meattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t;
+\newline 
+
+\layout LyX-Code
+
+typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t;
+\newline 
+
+\layout LyX-Code
+
+int PtlMEAttach( ptl_handle_ni_t  interface,
+\newline 
+                 ptl_pt_index_t   index,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_unlink_t     unlink,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+ are used to control where a new item is inserted.
+ The value 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+ is used to insert the new item before the current item or before the head
+ of the list.
+ The value 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+ is used to insert the new item after the current item or after the last
+ item in the list.
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttach
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to the Portal table for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PTINDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid Portal table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="7" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The Portal table index where the match list  should be attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specifies the match criteria for the process id of the requestor.
+  The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to  wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+match_bits, ignorebits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specify the match criteria to apply  to the match bits in the incoming request.
+  The 
+\family typewriter 
+ignorebits
+\family default 
+ are used to mask out insignificant bits in the incoming match bits.
+  The resulting bits are then compared to the match list entry's match 
+ bits to determine if the incoming request meets the match criteria.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates the match list entry should be unlinked when the last memory descripto
+r associated with this match list  entry is unlinked.
+  (Note, the check for unlinking a match entry  only occurs when a memory
+ descriptor is unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be prepended or appended to
+ the existing match list.
+ If there is no existing list, this argument is ignored and the new match
+ entry becomes the only entry in the list.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEAttachAny
+\begin_inset LatexCommand \label{sec:attachany}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEAttachAny( ptl_handle_ni_t  interface,
+\newline 
+                    ptl_pt_index_t   *index,
+\newline 
+                    ptl_process_id_t matchid,
+\newline 
+                    ptl_match_bits_t match_bits,
+\newline 
+                    ptl_match_bits_t ignorebits,
+\newline 
+                    ptl_unlink_t     unlink,
+\newline 
+                    ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttachAny
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to an unused Portal table entry for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_PT_FULL Indicates that there are no free entries in the Portal table.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the Portal index where the
+ match list  has been attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid, match_bits, ignorebits, unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEInsert
+\begin_inset LatexCommand \label{sec:meinsert}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEInsert( ptl_handle_me_t  current,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEInsert
+\emph default 
+ function creates a new match list entry and inserts this entry into the
+ match list containing 
+\family typewriter 
+current
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+current
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match entry.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+current
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for a match entry.
+  The new match entry will be inserted immediately before or immediately
+ after this match entry.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\family default 
+, 
+\family typewriter 
+match_bits
+\family default 
+, 
+\family typewriter 
+ignorebits
+\family default 
+,  
+\family typewriter 
+unlink
+\family default 
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion  for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be inserted before or after
+ the 
+\family typewriter 
+current
+\family default 
+ entry.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEUnlink
+\begin_inset LatexCommand \label{sec:meunlink}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEUnlink( ptl_handle_me_t entry );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMEUnlink
+\emph default 
+ function can be used to unlink a match entry from a match list.
+ This operation also releases any resources associated with the match entry
+ (including the associated memory descriptor).
+ It is an error to use the match entry handle after calling 
+\emph on 
+PtlMEUnlink
+\emph default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+entry
+\family default 
+ is not a valid match entry handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+entry
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the match entry to be unlinked.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Memory Descriptors
+\begin_inset LatexCommand \label{sec:md}
+
+\end_inset 
+
+
+\layout Standard
+
+A memory descriptor contains information about a region of an application
+ process' memory and an event queue where information about the operations
+ performed on the memory descriptor is recorded.
+ The Portals API provides two operations to create memory descriptors: 
+\emph on 
+PtlMDAttach
+\emph default 
+, and 
+\emph on 
+PtlMDBind
+\emph default 
+; an operation to update a memory descriptor, 
+\emph on 
+PtlMDUpdate
+\emph default 
+; and an operation to unlink and release the resources associated with a
+ memory descriptor, 
+\emph on 
+PtlMDUnlink
+\emph default 
+.
+\layout Subsection
+
+The Memory Descriptor Type
+\begin_inset LatexCommand \label{sec:md-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    void*           start;
+\newline 
+    ptl_size_t      length;
+\newline 
+    int             threshold;
+\newline 
+    unsigned int    max_offset;
+\newline 
+    unsigned int    options;
+\newline 
+    void*           user_ptr;
+\newline 
+    ptl_handle_eq_t eventq;
+\newline 
+} ptl_md_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_md_t
+\family default 
+ type defines the application view of a memory descriptor.
+ Values of this type are used to initialize and update the memory descriptors.
+\layout Subsubsection
+
+Members
+\layout Description
+
+start,\SpecialChar ~
+length Specify the memory region associated with the memory descriptor.
+ The 
+\family typewriter 
+start
+\family default 
+ member specifies the starting address for the memory region and the 
+\family typewriter 
+length
+\family default 
+ member specifies the length of the region.
+ The 
+\family typewriter 
+start
+\family default 
+ member can be NULL provided that the 
+\family typewriter 
+length
+\family default 
+ member is zero.
+ (Zero length buffers are useful to record events.) There are no alignment
+ restrictions on the starting address or the length of the region; although,
+ unaligned messages may be slower (i.e., lower bandwidth and/or longer latency)
+ on some implementations.
+\layout Description
+
+threshold Specifies the maximum number of operations that can be performed
+ on the memory descriptor.
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+ for the different types of events).
+ In the usual case, the threshold value is decremented for each operation
+ on the memory descriptor.
+ When the threshold value is zero, the memory descriptor is 
+\emph on 
+inactive
+\emph default 
+, and does not respond to operations.
+ A memory descriptor can have an initial threshold value of zero to allow
+ for manipulation of an inactive memory descriptor by the local process.
+ A threshold value of 
+\family typewriter 
+PTL_MD_THRESH_INF
+\family default 
+ indicates that there is no bound on the number of operations that may be
+ applied to a memory descriptor.
+ Note that local operations (e.g., 
+\emph on 
+PtlMDUpdate
+\emph default 
+) are not applied to the threshold count.
+\layout Description
+
+max_offset Specifies the maximum local offset of a memory descriptor.
+ When the local offset of a memory descriptor exceeds this maximum, the
+ memory descriptor becomes 
+\shape italic 
+inactive
+\shape default 
+ and does not respond to further operations.
+\layout Description
+
+options Specifies the behavior of the memory descriptor.
+ There are five options that can be selected: enable put operations (yes
+ or no), enable get operations (yes or no), offset management (local or
+ remote), message truncation (yes or no), and acknowledgement (yes or no).
+ Values for this argument can be constructed using a bitwise or of the following
+ values: 
+\begin_deeper 
+\begin_deeper 
+\layout Description
+
+PTL_MD_OP_PUT Specifies that the memory descriptor will respond to 
+\emph on 
+put
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+put
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_OP_GET Specifies that the memory descriptor will respond to 
+\emph on 
+get
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+get
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory
+ region is provided by the incoming request.
+ By default, the offset is maintained locally.
+ When the offset is maintained locally, the offset is incremented by the
+ length of the request so that the next operation (put and/or get) will
+ access the next part of the memory region.
+\layout Description
+
+PTL_MD_TRUNCATE Specifies that the length provided in the incoming request
+ can be reduced to match the memory available in the region.
+ (The memory available in a memory region is determined by subtracting the
+ offset from the length of the memory region.) By default, if the length
+ in the incoming operation is greater than the amount of memory available,
+ the operation is rejected.
+\layout Description
+
+PTL_MD_ACK_DISABLE Specifies that an acknowledgement should 
+\emph on 
+not
+\emph default 
+ be sent for incoming 
+\emph on 
+put
+\emph default 
+ operations, even if requested.
+ By default, acknowledgements are sent for 
+\emph on 
+put
+\emph default 
+ operations that request an acknowledgement.
+ Acknowledgements are never sent for 
+\emph on 
+get
+\emph default 
+ operations.
+ The value sent in the reply serves as an implicit acknowledgement.
+\end_deeper 
+\layout Standard
+
+
+\series bold 
+Note
+\series default 
+: It is not considered an error to have a memory descriptor that does not
+ respond to either 
+\emph on 
+put
+\emph default 
+ or 
+\emph on 
+get
+\emph default 
+ operations: Every memory descriptor responds to 
+\emph on 
+reply
+\emph default 
+ operations.
+ Nor is it considered an error to have a memory descriptor that responds
+ to both 
+\emph on 
+put
+\emph default 
+ and 
+\emph on 
+get
+\emph default 
+ operations.
+\end_deeper 
+\layout Description
+
+user_ptr A user-specified value that is associated with the memory descriptor.
+ The value does not need to be a pointer, but must fit in the space used
+ by a pointer.
+ This value (along with other values) is recorded in events associated with
+ operations on this memory descriptor.
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Tying the memory descriptor to a user-defined value can be useful when multiple
+ memory descriptors share the same event queue or when the memory descriptor
+ needs to be associated with a data structure maintained by the application.
+ For example, an MPI implementation can set the 
+\family typewriter 
+user_ptr
+\family default 
+ argument to the value of an MPI Request.
+ This direct association allows for processing of memory descriptors by
+ the MPI implementation without a table lookup or a search for the appropriate
+ MPI Request.
+\end_inset 
+
+
+\layout Description
+
+eventq A handle for the event queue used to log the operations performed
+ on the memory region.
+ If this argument is 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, operations performed on this memory descriptor are not logged.
+\layout Subsection
+
+PtlMDAttach
+\begin_inset LatexCommand \label{sec:mdattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDAttach( ptl_handle_me_t  match,
+\newline 
+                 ptl_md_t         mem_desc,
+\newline 
+                 ptl_unlink_t     unlink_op,
+\newline 
+                 ptl_unlink_t     unlink_nofit,
+\newline 
+                 ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_unlink_t
+\family default 
+ are used to control whether an item is unlinked from a list.
+ The value 
+\family typewriter 
+PTL_UNLINK
+\family default 
+ enables unlinking.
+ The value 
+\family typewriter 
+PTL_RETAIN
+\family default 
+ disables unlinking.
+\layout Standard
+
+The 
+\emph on 
+PtlMDAttach
+\emph default 
+ operation is used to create a memory descriptor and attach it to a match
+ list entry.
+ An error code is returned if this match list entry already has an associated
+ memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INUSE Indicates that 
+\family typewriter 
+match
+\family default 
+ already has a memory descriptor attached.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+match
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface associated with 
+\family typewriter 
+match
+\family default 
+.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the match entry that the memory descriptor will be associated
+ with.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_op
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is  unlinked when it becomes
+ inactive, either because the operation threshold drops to zero or because
+ the maximum offset has been exceeded.
+  (Note, the check for unlinking a memory descriptor only occurs after
+ the completion of a successful operation.
+  If the threshold is set to zero during initialization or  using 
+\emph on 
+PtlMDUpdate
+\emph default 
+, the memory descriptor is 
+\series bold 
+not
+\series default 
+  unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_nofit
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is  unlinked when the space
+ remaining in the memory descriptor is not sufficient for a matching operation.
+ If an incoming message arrives at a memory descriptor that does
+ not have sufficient space and the 
+\series bold 
+PTL_MD_TRUNCATE
+\series default 
+ option is not specified, the memory descriptor will be unlinked.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument can be NULL, in which case the handle will not be returned.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDBind
+\begin_inset LatexCommand \label{sec:mdbind}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDBind( ptl_handle_ni_t  interface,
+\newline 
+               ptl_md_t         mem_desc,
+\newline 
+               ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDBind
+\emph default 
+ operation is used to create a 
+\begin_inset Quotes eld
+\end_inset 
+
+free floating
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor, i.e., a memory descriptor that is not associated with
+ a match list entry.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface, 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that the event queue associated with 
+\family typewriter 
+mem_desc
+\family default 
+ is not valid.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the network interface with which the memory descriptor will
+ be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the  memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument must be a valid address and cannot be NULL.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUnlink
+\begin_inset LatexCommand \label{sec:mdfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUnlink( ptl_handle_md_t mem_desc );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUnlink
+\emph default 
+ function unlinks the memory descriptor from any match list entry it may
+ be linked to and releases the resources associated with a memory descriptor.
+ (This function does not free the memory region associated with the memory
+ descriptor.) This function also releases the resources associated with a
+ floating memory descriptor.
+ Only memory descriptors with no pending operations may be unlinked.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_MD_INUSE Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ has pending operations and cannot be unlinked.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUpdate
+\begin_inset LatexCommand \label{sec:mdupdate}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUpdate( ptl_handle_md_t mem_desc,
+\newline 
+                 ptl_md_t*       old_md,
+\newline 
+                 ptl_md_t*       new_md,
+\newline 
+                 ptl_handle_eq_t testq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function provides a conditional, atomic update operation for memory descriptors.
+ The memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is only updated if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ The intent is to only enable updates to the memory descriptor when no new
+ messages have arrived since the last time the queue was checked.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+ for an example of how this function can be used.
+\layout Standard
+
+If 
+\family typewriter 
+new_md
+\family default 
+ is not NULL, the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ will be updated
+ to reflect the values in the structure pointed to by 
+\family typewriter 
+new_md
+\family default 
+ if 
+\family typewriter 
+testq
+\family default 
+ has the value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+ or if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ If 
+\family typewriter 
+old_md
+\family default 
+ is not NULL, the current value of the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is recorded in the location identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_NOUPDATE Indicates that the update was not performed because 
+\family typewriter 
+testq
+\family default 
+ was not empty.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_ILL_MD Indicates that the value pointed to by 
+\family typewriter 
+new_md
+\family default 
+ is not a legal memory descriptor (e.g., the memory region specified by the
+ memory descriptor may be invalid).
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+testq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+new_md
+\family default 
+ or 
+\family typewriter 
+old_md
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+old_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+old_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, the current value of the memory descriptor will be stored in the location
+ identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+new_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+new_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, this argument provides the new values for the memory descriptor, if the
+ update is performed.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+testq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for an event queue used to predicate the update.
+ If 
+\family typewriter 
+testq
+\family default 
+ is equal to 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, the update is performed unconditionally.
+  Otherwise, the update is performed if and only if 
+\family typewriter 
+testq
+\family default 
+ is empty.
+  If the update is  not performed, the function returns the value 
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+.
+  (Note, the 
+\family typewriter 
+testq
+\family default 
+ argument does not need to be the same as  the event queue associated with
+ the memory descriptor.)
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Standard
+
+The conditional update can be used to ensure that the memory descriptor
+ has not changed between the time it was examined and the time it is updated.
+ In particular, it is needed to support an MPI implementation where the
+ activity of searching an unexpected message queue and posting a receive
+ must be atomic.
+\layout Section
+
+Events and Event Queues
+\begin_inset LatexCommand \label{sec:eq}
+
+\end_inset 
+
+
+\layout Standard
+
+Event queues are used to log operations performed on memory descriptors.
+ They can also be used to hold acknowledgements for completed 
+\emph on 
+put
+\emph default 
+ operations and to note when the data specified in a 
+\emph on 
+put
+\emph default 
+ operation has been sent (i.e., when it is safe to reuse the buffer that holds
+ this data).
+ Multiple memory descriptors can share a single event queue.
+\layout Standard
+
+In addition to the 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+ type, the Portals API defines two types associated with events: The 
+\family typewriter 
+
+\newline 
+ptl_event_kind_t
+\family default 
+ type defines the kinds of events that can be stored in an event queue.
+ The 
+\family typewriter 
+ptl_event_t
+\family default 
+ type defines a structure that holds the information associated with an
+ event.
+\layout Standard
+
+The Portals API also provides four functions for dealing with event queues:
+ The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to allocate the API resources needed for an event queue,
+ the 
+\emph on 
+PtlEQFree
+\emph default 
+ function is used to release these resources, the 
+\emph on 
+PtlEQGet
+\emph default 
+ function can be used to get the next event from an event queue, and the
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block a process (or thread) until an event queue
+ has at least one event.
+\layout Subsection
+
+Kinds of Events
+\begin_inset LatexCommand \label{sec:ek-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { 
+\newline 
+    PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL,
+\newline 
+    PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL,
+\newline 
+    PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL,
+\newline 
+    PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL,
+\newline 
+    PTL_EVENT_ACK,
+\newline 
+    PTL_EVENT_UNLINK
+\newline 
+} ptl_event_kind_t;
+\layout Standard
+\noindent 
+The Portals API defines fourteen types of events that can be logged in an
+ event queue: 
+\layout Description
+
+PTL_EVENT_GET_START A remote 
+\emph on 
+get
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_GET_END A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed successfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_GET_FAIL A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed unsuccessfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_PUT_START A remote 
+\emph on 
+put
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should be considered
+ volatile until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_PUT_END A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed successfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_PUT_FAIL A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed unsuccessfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_REPLY_START A 
+\emph on 
+reply
+\emph default 
+ operation has been started on the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_END A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed successfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_FAIL A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed unsuccessfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_ACK An 
+\emph on 
+acknowledgement
+\emph default 
+ was received.
+ This event is logged when the acknowledgement is received.
+\layout Description
+
+PTL_EVENT_SEND_START An outgoing 
+\emph on 
+send
+\emph default 
+ operation has been started.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_SEND_END A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed successfully.
+ This event is logged after the entire buffer has been sent and it is safe
+ for the application to reuse the buffer.
+\layout Description
+
+PTL_EVENT_SEND_FAIL A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed unsuccessfully.
+ The process can safely manipulate the memory or free the memory descriptor
+ once it sees this event.
+\layout Description
+
+PTL_EVENT_UNLINK A memory descriptor associated with this event queue has
+ been automatically unlinked.
+ This event is not generated when a memory descriptor is explicitly unlinked
+ by calling 
+\shape italic 
+PtlMDUnlink
+\shape default 
+.
+ This event does not decrement the threshold count.
+\layout Subsection
+
+Event Ordering
+\layout Standard
+
+The Portals API guarantees that when a process initiates two operations
+ on a remote process, the operations will be initiated on the remote process
+ in the same order that they were initiated on the original process.
+ As an example, if process A initiates two 
+\emph on 
+put
+\emph default 
+ operations, 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+, on process B, the Portals API guarantees that process A will receive the
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ in the same order that process B receives the 
+\family typewriter 
+PTL_EVENT_PUT_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+.
+ Notice that the API does not guarantee that the start events will be delivered
+ in the same order that process A initiated the 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ operations.
+ If process A needs to ensure the ordering of these operations, it should
+ include code to wait for the initiation of 
+\emph on 
+x
+\emph default 
+ before it initiates 
+\emph on 
+y
+\emph default 
+.
+\layout Subsection
+
+Failure Notification
+\layout Standard
+
+Operations may fail to complete successfully; however, unless the node itself
+ fails, every operation that is started will eventually complete.
+ While an operation is in progress, the memory associated with the operation
+ should not be viewed (in the case of a put or a reply) or altered (in the
+ case of a send or get).
+ Operation completion, whether successful or unsuccessful, is final.
+ That is, when an operation completes, the memory associated with the operation
+ will no longer be read or altered by the operation.
+ A network interface can use the 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+ to define more specific information regarding the failure of the operation
+ and record this information in the 
+\family typewriter 
+ni_fail_type
+\family default 
+ field of the event.
+\layout Subsection
+
+The Event Type
+\begin_inset LatexCommand \label{sec:event-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_event_kind_t      type;
+\newline 
+    ptl_process_id_t      initiator;
+\newline 
+    ptl_uid_t             uid;
+\layout LyX-Code
+
+    ptl_pt_index_t        portal;
+\newline 
+    ptl_match_bits_t      match_bits;
+\newline 
+    ptl_size_t            rlength;
+\newline 
+    ptl_size_t            mlength;
+\newline 
+    ptl_size_t            offset; 
+\newline 
+    ptl_handle_md_t       md_handle;
+\newline 
+    ptl_md_t              mem_desc;
+\newline 
+    ptl_hdr_data_t        hdr_data;
+\newline 
+    ptl_seq_t             link;
+\newline 
+    ptl_ni_fail_t         ni_fail_type;
+\newline 
+    volatile ptl_seq_t    sequence;
+\newline 
+} ptl_event_t;
+\layout Standard
+\noindent 
+An event structure includes the following members: 
+\layout Description
+
+type Indicates the type of the event.
+\layout Description
+
+initiator The id of the initiator.
+\layout Description
+
+portal The Portal table index specified in the request.
+\layout Description
+
+match_bits A copy of the match bits specified in the request.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for more information on match bits.
+\layout Description
+
+rlength The length (in bytes) specified in the request.
+\layout Description
+
+mlength The length (in bytes) of the data that was manipulated by the operation.
+ For truncated operations, the manipulated length will be the number of
+ bytes specified by the memory descriptor (possibly with an offset).
+ For all other operations, the manipulated length will be the length of
+ the requested operation.
+\layout Description
+
+offset Is the displacement (in bytes) into the memory region that the operation
+ used.
+ The offset can be determined by the operation (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+) for a remote managed memory descriptor, or by the local memory descriptor
+ (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+).
+\layout Description
+
+md_handle Is the handle to the memory descriptor associated with the event.
+\layout Description
+
+mem_desc Is the state of the memory descriptor immediately after the event
+ has been processed.
+\layout Description
+
+hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+).
+\layout Description
+
+link The 
+\emph on 
+link
+\emph default 
+ member is used to link 
+\family typewriter 
+START
+\family default 
+ events with the 
+\family typewriter 
+END
+\family default 
+ or 
+\family typewriter 
+FAIL
+\family default 
+ event that signifies completion of the operation.
+ The 
+\emph on 
+link
+\emph default 
+ member will be the same for the two events associated with an operation.
+ The link member is also used to link an 
+\family typewriter 
+UNLINK
+\family default 
+ event with the event that caused the memory descriptor to be unlinked.
+\layout Description
+
+sequence The sequence number for this event.
+ Sequence numbers are unique to each event.
+\layout Comment
+
+The 
+\emph on 
+sequence
+\emph default 
+ member is the last member and is volatile to support SMP implementations.
+ When an event structure is filled in, the 
+\emph on 
+sequence
+\emph default 
+ member should be written after all other members have been updated.
+ Moreover, a memory barrier should be inserted between the updating of other
+ members and the updating of the 
+\emph on 
+sequence
+\emph default 
+ member.
+\layout Subsection
+
+PtlEQAlloc
+\begin_inset LatexCommand \label{sec:eqalloc}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQAlloc( ptl_handle_ni_t  interface,
+\newline 
+                ptl_size_t       count,
+\newline 
+                ptl_handle_eq_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to build an event queue.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ event queue.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface with which the event queue  will be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+count
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The number of events that can be stored in the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQFree
+\begin_inset LatexCommand \label{sec:eqfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQFree( ptl_handle_eq_t eventq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQFree
+\emph default 
+ function releases the resources associated with an event queue.
+ It is up to the user to ensure that no memory descriptors are associated
+ with the event queue once it is freed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the event queue to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQGet
+\begin_inset LatexCommand \label{sec:eqget}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQGet( ptl_handle_eq_t eventq,
+\newline 
+              ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQGet
+\emph default 
+ function is a nonblocking function that can be used to get the next event
+ in an event queue.
+ The event is removed from the queue.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_EQ_EMPTY Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is empty or another thread is waiting on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQWait
+\begin_inset LatexCommand \label{sec:eqwait}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQWait( ptl_handle_eq_t eventq,
+\newline 
+               ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block the calling process (thread) until there
+ is an event in an event queue.
+ This function also returns the next event in the event queue and removes
+ this event from the queue.
+ This is the only blocking operation in the Portals 3.2 API.
+ In the event that multiple threads are waiting on the same event queue,
+ PtlEQWait is guaranteed to wake exactly one thread, but the order in which
+ they are awakened is not specified.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+\noindent 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue to wait on.
+  The calling process (thread) will be blocked until 
+\family typewriter 
+eventq
+\family default 
+ is not empty.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+The Access Control Table
+\begin_inset LatexCommand \label{sec:ac}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes can use the access control table to control which processes are
+ allowed to perform operations on Portal table entries.
+ Each communication interface has a Portal table and an access control table.
+ The access control table for the default interface contains an entry at
+ index zero that allows all processes with the same user id to communicate.
+ Entries in the access control table can be manipulated using the 
+\emph on 
+PtlACEntry
+\emph default 
+ function.
+\layout Subsection
+
+PtlACEntry
+\begin_inset LatexCommand \label{sec:acentry}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlACEntry( ptl_handle_ni_t  interface,
+\newline 
+                ptl_ac_index_t   index,
+\newline 
+                ptl_process_id_t matchid,
+\newline 
+                ptl_uid_t        user_id,
+\newline 
+                ptl_pt_index_t   portal );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlACEntry
+\emph default 
+ function can be used to update an entry in the access control table for
+ an interface.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_AC_INV_INDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid access control table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_PT_INV_INDEX Indicates that 
+\family typewriter 
+portal
+\family default 
+ is not a valid Portal table index.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index of the entry in the access control table to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the process(es) that are allowed to  perform operations.
+ The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+user_id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the user that is allowed to  perform operations.
+ The value 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ can be used to wildcard the user.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the Portal index(es) that can be used.
+  The value 
+\family typewriter 
+PTL_PT_INDEX_ANY
+\family default 
+ can be used to wildcard the  Portal index.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Data Movement Operations
+\begin_inset LatexCommand \label{sec:datamovement}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API provides two data movement operations: 
+\emph on 
+PtlPut
+\emph default 
+ and 
+\emph on 
+PtlGet
+\emph default 
+.
+\layout Subsection
+
+PtlPut
+\begin_inset LatexCommand \label{sec:put}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t;
+\newline 
+
+\newline 
+int PtlPut( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_ack_req_t    ack_req,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset,
+\newline 
+            ptl_hdr_data_t   hdr_data );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ack_req_t
+\family default 
+ are used to control whether an acknowledgement should be sent when the
+ operation completes (i.e., when the data has been written to a memory descriptor
+ of the 
+\family typewriter 
+target
+\family default 
+ process).
+ The value 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+ requests an acknowledgement, the value 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+ requests that no acknowledgement should be generated.
+\layout Standard
+
+The 
+\emph on 
+PtlPut
+\emph default 
+ function initiates an asynchronous put operation.
+ There are several events associated with a put operation: initiation of
+ the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+), completion of the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_END
+\family default 
+ or 
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\family default 
+), and, when the send completes successfully, the receipt of an acknowledgement
+ (
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+) indicating that the operation was accepted by the target.
+ These events will be logged in the event queue associated with the memory
+ descriptor (
+\family typewriter 
+mem_desc
+\family default 
+) used in the put operation.
+ Using a memory descriptor that does not have an associated event queue
+ results in these events being discarded.
+ In this case, the application must have another mechanism (e.g., a higher
+ level protocol) for determining when it is safe to modify the memory region
+ associated with the memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="8" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory to be sent.
+  If the memory descriptor has an event queue  associated with it, it will
+ be used to record events when the  message has been sent (PTL_EVENT_SEND_START,
+ PTL_EVENT_SEND_END).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ack_req
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Controls whether an acknowledgement event is requested.
+  Acknowledgements are only sent when they are requested by the initiating
+ process 
+\series bold 
+and
+\series default 
+ the memory descriptor has an event queue 
+\series bold 
+and
+\series default 
+ the target memory descriptor enables them.
+ Allowed constants: 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+, 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+hdr_data
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+64 bits of user data that can be included in the message header.
+  This data is written to an event queue entry at the target if an event
+ queue is present on the matching memory descriptor.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlGet
+\begin_inset LatexCommand \label{sec:get}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGet( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlGet
+\emph default 
+ function initiates a remote read operation.
+ There are two event pairs associated with a get operation: when the data
+ is sent from the remote node, a 
+\family typewriter 
+PTL_EVENT_GET_{START|END}
+\family default 
+ event pair is registered on the remote node; and when the data is returned
+ from the remote node a 
+\family typewriter 
+PTL_EVENT_REPLY_{START|END}
+\family default 
+ event pair is registered on the local node.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="6" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory into which
+ the requested data will be received.
+  The memory descriptor can have an event queue associated with it to record
+ events, such as when the message receive has started (
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+_
+\family typewriter 
+START
+\family default 
+).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Summary
+\layout Standard
+
+
+\begin_inset LatexCommand \label{sec:summary}
+
+\end_inset 
+
+ We conclude this section by summarizing the names introduced by the Portals
+ 3.2 API.
+ We start by summarizing the names of the types introduced by the API.
+ This is followed by a summary of the functions introduced by the API.
+ That, in turn, is followed by a summary of the function return codes.
+ Finally, we conclude with a summary of the other constant values introduced
+ by the API.
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+ presents a summary of the types defined by the Portals API.
+ The first column in this table gives the type name, the second column gives
+ a brief description of the type, the third column identifies the section
+ where the type is defined, and the fourth column lists the functions that
+ have arguments of this type.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Types Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:types}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\noindent 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="25" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2in">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.2in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Sect
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Functions 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for an access control table 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlACEntry, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+acknowledgement request types 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlPut
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+kinds of events
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+information about events 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_seq_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+event sequence number
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_any_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for any object 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for event queues 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert,
+ PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_me_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for match entries 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_ni_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut,
+ PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+node identifiers
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+process identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetUid, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+insertion position (before or after) 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+identifiers for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+match (and ignore) bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mb-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ni_fail_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+network interface-specific failures
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+process identifiers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for Portal tables 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+sizes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:size-t}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_value_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+values in status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+unlink options 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+ presents a summary of the functions defined by the Portals API.
+ The first column in this table gives the name for the function, the second
+ column gives a brief description of the operation implemented by the function,
+ and the third column identifies the section where the function is defined.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Functions Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:func}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="24" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlACEntry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update an entry in an access control table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQAlloc 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the next event from an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQFree 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ release the resources for an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQWait 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ wait for a new event in an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a get operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGetId 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the id for the current process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a memory descriptor and attach it to a match entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDBind 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a free-floating memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a memory descriptor from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUpdate 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a Portal table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a free Portal table entry
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:attachany}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEInsert 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a match entry and insert it in a list 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a match entry from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIDist 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the distance to another process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIHandle 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the network interface handle for an object 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIStatus 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ read a network interface status register 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlPut 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a put operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+ summarizes the return codes used by functions defined by the Portals API.
+ All of these constants are integer values.
+ The first column of this table gives the symbolic name for the constant,
+ the second column gives a brief description of the value, and the third
+ column identifies the functions that can return this value.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Function Return Codes for the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:retcodes}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="27" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.6in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Functions
+\series default 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_AC_INV_INDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_DROPPED
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+at least one event has been dropped 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_EMPTY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no events available in an event queue 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_FAIL 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+error during initialization or cleanup 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlInit, PtlFini 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ILL_MD
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+illegal memory descriptor values 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDBind, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_DUP 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+duplicate initialization of an interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_INV
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initialization of an invalid interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+the ME already has an MD
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ASIZE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table size 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_EQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUpdate, PtlEQFree, PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_HANDLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid memory descriptor handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUnlink, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ME
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid match entry handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid network interface handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PROC 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid process identifier 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PTINDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid Portal table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_REG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_SR_INDX 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ML_TOOLONG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match list too long 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+MD has pending operations
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMDUnlink
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOINIT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+uninitialized API 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\emph default 
+, except PtlInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOSPACE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insufficient memory 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ no update was performed 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_FULL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Portal table is full
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_OK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ success 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SEGV 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+addressing violation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate,
+ PtlEQAlloc, PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ summarizes the remaining constant values introduced by the Portals API.
+ The first column in this table presents the symbolic name for the constant,
+ the second column gives a brief description of the value, the third column
+ identifies the type for the value, the fourth column identifies the section
+ in which the value is introduced, and the fifth column identifies any other
+ sections in which the value is referenced.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Other Constants Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:oconsts}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="36" columns="5">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Base type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Intr.
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Ref.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request an acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_NONE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a NULL event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_UNLINK
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+unlink event
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PID_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for process id fields 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for node id fields
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for user id
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_IFACE_DEFAULT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+default interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_AFTER 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert after 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_BEFORE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert before 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_ACK_DISABLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to disable acknowledgements 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_MANAGE_REMOTE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable the use of remote offsets 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_GET 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable get operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_PUT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable put operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_THRESH_INF 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+infinite threshold for a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_TRUNCATE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable truncation of a request 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOACK_REQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request no acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_INDEX_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for Portal indexes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_RETAIN 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+disable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SR_DROP_COUNT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+index for the dropped count register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UNLINK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+enable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Chapter
+
+The Semantics of Message Transmission
+\begin_inset LatexCommand \label{sec:semantics}
+
+\end_inset 
+
+
+\layout Standard
+
+The portals API uses four types of messages: put requests, acknowledgements,
+ get requests, and replies.
+ In this chapter, we describe the information passed on the wire for each
+ type of message.
+ We also describe how this information is used to process incoming messages.
+\layout Section
+
+Sending Messages
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:put-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a put request.
+ The first column provides a descriptive name for the information, the second
+ column provides the type for this information, the third column identifies
+ the source of the information, and the fourth column provides additional
+ notes.
+ Most information that is transmitted is obtained directly from the 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ Notice that the handle for the memory descriptor used in the 
+\emph on 
+PtlPut
+\emph default 
+ operation is transmitted even though this value cannot be interpreted by
+ the target.
+ A value of anything other than 
+\family typewriter 
+PTL_MD_NONE
+\family default 
+ is interpreted as a request for an acknowledgement.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Put Request
+\begin_inset LatexCommand \label{tab:put-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="12" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlPut
+\emph default 
+ arg
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a put request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no ack if 
+\family typewriter 
+PTL_MD_NONE
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family roman 
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+start
+\family default 
+ and 
+\family typewriter 
+length
+\family default 
+ members 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:ack-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in an acknowledgement.
+ Most of the information is simply echoed from the put request.
+ Notice that the initiator and target are obtained directly from the put
+ request, but are swapped in generating the acknowledgement.
+ The only new piece of information in the acknowledgement is the manipulated
+ length, which is determined as the put request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in an Acknowledgement
+\begin_inset LatexCommand \label{tab:ack-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="10" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Put Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ indicates an acknowledgement 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:get-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a get request.
+ Like the information transmitted in a put request, most of the information
+ transmitted in a get request is obtained directly from the 
+\emph on 
+PtlGet
+\emph default 
+ operation.
+ Unlike put requests, get requests do not include the event queue handle.
+ In this case, the reply is generated whenever the operation succeeds and
+ the memory descriptor must not be unlinked until the reply is received.
+ As such, there is no advantage to explicitly sending the event queue handle.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Get Request
+\begin_inset LatexCommand \label{tab:get-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlGet
+\emph default 
+ argument
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a get request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:reply-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in a reply.
+ Like an acknowledgement, most of the information is simply echoed from
+ the get request.
+ The initiator and target are obtained directly from the get request, but
+ are swapped in generating the reply.
+ The only new pieces of information in the reply are the manipulated length
+ and the data, which are determined as the get request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Reply
+\begin_inset LatexCommand \label{tab:reply-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Get Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a reply 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Section
+
+Receiving Messages
+\begin_inset LatexCommand \label{sec:receiving}
+
+\end_inset 
+
+
+\layout Standard
+
+When an incoming message arrives on a network interface, the communication
+ system first checks that the target process identified in the request is
+ a valid process that has initialized the network interface (i.e., that the
+ target process has a valid Portal table).
+ If this test fails, the communication system discards the message and increment
+s the dropped message count for the interface.
+ The remainder of the processing depends on the type of the incoming message.
+ Put and get messages are subject to access control checks and translation
+ (searching a match list), while acknowledgement and reply messages bypass
+ the access control checks and the translation step.
+\layout Standard
+
+Acknowledgement messages include a handle for the memory descriptor used
+ in the original 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ This memory descriptor will identify the event queue where the event should
+ be recorded.
+ Upon receipt of an acknowledgement, the runtime system only needs to confirm
+ that the memory descriptor and event queue still exist and that there is
+ space for another event.
+ Should any of these conditions fail, the message is simply discarded
+ and the dropped message count for the interface is incremented.
+ Otherwise, the system builds an acknowledgement event from the information
+ in the acknowledgement message and adds it to the event queue.
+\layout Standard
+
+Reception of reply messages is also relatively straightforward.
+ Each reply message includes a handle for a memory descriptor.
+ If this descriptor exists, it is used to receive the message.
+ A reply message will be dropped if the memory descriptor identified in
+ the request doesn't exist.
+ In this case, the dropped message count for the interface is
+ incremented.
+ These are the only reasons for dropping reply messages.
+ Every memory descriptor accepts and truncates incoming reply messages,
+ eliminating the other potential reasons for rejecting a reply message.
+\layout Standard
+
+The critical step in processing an incoming put or get request involves
+ mapping the request to a memory descriptor.
+ This step starts by using the Portal index in the incoming request to identify
+ a list of match entries.
+ This list of match entries is searched in order until a match entry is
+ found whose match criteria matches the match bits in the incoming request
+ and whose memory descriptor accepts the request.
+\layout Standard
+
+Because acknowledgement and reply messages are generated in response to requests
+ made by the process receiving these messages, the checks performed by the
+ runtime system for acknowledgements and replies are minimal.
+ In contrast, put and get messages are generated by remote processes and
+ the checks performed for these messages are more extensive.
+ Incoming put or get messages may be rejected because: 
+\layout Itemize
+
+the Portal index supplied in the request is not valid; 
+\layout Itemize
+
+the cookie supplied in the request is not a valid access control entry;
+\layout Itemize
+
+the access control entry identified by the cookie does not match the identifier
+ of the requesting process; 
+\layout Itemize
+
+the access control entry identified by the cookie does not
+ match the Portal index supplied in the request; or 
+\layout Itemize
+
+the match bits supplied in the request do not match any of the match entries
+ with a memory descriptor that accepts the request.
+\layout Standard
+
+In all cases, if the message is rejected, the incoming message is discarded
+ and the dropped message count for the interface is incremented.
+\layout Standard
+
+A memory descriptor may reject an incoming request for any of the following
+ reasons: 
+\layout Itemize
+
+the 
+\family typewriter 
+PTL_MD_PUT
+\family default 
+ or 
+\family typewriter 
+PTL_MD_GET
+\family default 
+ option has not been enabled and the operation is put or get, respectively;
+\layout Itemize
+
+the length specified in the request is too long for the memory descriptor
+ and the 
+\family typewriter 
+PTL_MD_TRUNCATE
+\family default 
+ option has not been enabled.
+\layout Chapter
+
+Examples
+\begin_inset LatexCommand \label{sec:examples}
+
+\end_inset 
+
+
+\layout Comment
+
+The examples presented in this chapter have not been updated to reflect
+ the current API.
+\layout Standard
+
+In this section we present several examples to illustrate expected usage
+ patterns for the Portals 3.2 API.
+ The first example describes how to implement parallel servers using the
+ features of the Portals 3.2 API.
+ This example covers the access control list and the use of remote managed
+ offsets.
+ The second example presents an approach to dealing with dropped requests.
+ This example covers aspects of match lists and memory descriptors.
+ The final example covers message reception in MPI.
+ This example illustrates more sophisticated uses of matching and a procedure
+ to update a memory descriptor.
+\layout Section
+
+Parallel File Servers
+\begin_inset LatexCommand \label{sec:expfs}
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:file}
+
+\end_inset 
+
+ illustrates the logical structure of a parallel file server.
+ In this case, the parallel server consists of four servers that stripe
+ application data across four disks.
+ We would like to present applications with the illusion that the file server
+ is a single entity.
+ We will assume that all of the processes that constitute the parallel server
+ have the same user id.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename file.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 196pt
+       lyxheight 147pt
+\end_inset 
+
+
+\layout Caption
+
+Parallel File Server
+\begin_inset LatexCommand \label{fig:file}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When an application establishes a connection to the parallel file server,
+ it will allocate a Portal and access control list entry for communicating
+ with the server.
+ The access control list entry will include the Portal and match any process
+ in the parallel file server, so all of the file server processes will
+ have access to the portal.
+ The Portal information and access control entry will be sent to the file
+ server at this time.
+ If the application and server need to have multiple, concurrent I/O operations,
+ they can use additional portals or match entries to keep the operations
+ from interfering with one another.
+\layout Standard
+
+When an application initiates an I/O operation, it first builds a memory
+ descriptor that describes the memory region involved in the operation.
+ This memory descriptor will enable the appropriate operation (put for read
+ operations and get for write operations) and enable the use of remote offsets
+ (this lets the servers decide where their data should be placed in the
+ memory region).
+ After creating the memory descriptor and linking it into the appropriate
+ Portal entry, the application sends a read or write request (using 
+\emph on 
+PtlPut
+\emph default 
+) to one of the file server processes.
+ The file server processes can then use put or get operations with the appropria
+te offsets to fill or retrieve the contents of the application's buffer.
+ To know when the operation has completed, the application can add an event
+ queue to the memory descriptor and add up the lengths of the remote operations
+ until the sum is the size of the requested I/O operation.
+\layout Section
+
+Dealing with Dropped Requests
+\begin_inset LatexCommand \label{sec:exdrop}
+
+\end_inset 
+
+
+\layout Standard
+
+If a process does not anticipate unexpected requests, they will be discarded.
+ Applications using the Portals API can query the dropped count for the
+ interface to determine the number of requests that have been dropped (see
+ Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ While this approach minimizes resource consumption, it does not provide
+ information that might be critical in debugging the implementation of a
+ higher level protocol.
+\layout Standard
+
+To keep track of more information about dropped requests, we use a memory
+ descriptor that truncates each incoming request to zero bytes and logs
+ the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ operations in an event queue.
+ Note that the operations are not dropped in the Portals sense, because
+ the operation succeeds.
+\layout Standard
+
+The following code fragment illustrates an implementation of this approach.
+ In this case, we assume that a thread is launched to execute the function
+\family typewriter 
+watch_drop
+\family default 
+.
+ This code starts by building an event queue to log truncated operations
+ and a memory descriptor to truncate the incoming requests.
+ This example only captures 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests for a single portal.
+ In a more realistic situation, the memory descriptor would be appended
+ to the match list for every portal.
+ We also assume that the thread is capable of keeping up with the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests.
+ If this is not the case, we could use a finite threshold on the memory
+ descriptor to capture the first few dropped requests.
+\layout LyX-Code
+
+
+\size small 
+#include <stdio.h>
+\newline 
+#include <stdlib.h>
+\newline 
+#include <portals.h>
+\newline 
+
+\newline 
+#define DROP_SIZE 32       /* number of dropped requests to track */
+\newline 
+
+\newline 
+int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) {
+\newline 
+    ptl_handle_eq_t drop_events;
+\newline 
+    ptl_event_t event;
+\newline 
+    ptl_handle_md_t drop_em;
+\newline 
+    ptl_md_t drop_desc;
+\newline 
+    ptl_process_id_t any_proc;
+\newline 
+    ptl_handle_me_t match_any;
+\newline 
+
+\newline 
+    /* create the event queue */
+\newline 
+    if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the event queue
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* build a match entry */
+\newline 
+    any_proc.nid = PTL_ID_ANY;
+\newline 
+    any_proc.pid = PTL_ID_ANY;
+\newline 
+    PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN,
+\newline 
+                        &match_any );
+\newline 
+
+\newline 
+    /* create the memory descriptor */
+\newline 
+    drop_desc.start = NULL;
+\newline 
+    drop_desc.length = 0;
+\newline 
+    drop_desc.threshold = PTL_MD_THRESH_INF;
+\newline 
+    drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE;
+\newline 
+    drop_desc.user_ptr = NULL;
+\newline 
+    drop_desc.eventq = drop_events;
+\newline 
+    if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the memory descriptor
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* watch for "dropped" requests */
+\newline 
+    while( 1 ) {
+\newline 
+        if( PtlEQWait( drop_events, &event ) != PTL_OK ) break;
+\newline 
+        fprintf( stderr, "Dropped request from gid = %d, rid = %d
+\backslash 
+n", event.initiator.gid,
+ event.initiator.rid );
+\newline 
+    }
+\newline 
+}
+\layout Section
+
+Message Transmission in MPI
+\begin_inset LatexCommand \label{sec:exmpi}
+
+\end_inset 
+
+
+\layout Standard
+
+We conclude this section with a fairly extensive example that describes
+ an approach to implementing message transmission for MPI.
+ Like many MPI implementations, we distinguish two message transmission
+ protocols: a short message protocol and a long message protocol.
+ We use the constant 
+\family typewriter 
+MPI_LONG_LENGTH
+\family default 
+ to determine the size of a long message.
+\layout Standard
+
+For small messages, the sender simply sends the message and presumes that
+ the message will be received (i.e., the receiver has allocated a memory region
+ to receive the message body).
+ For large messages, the sender also sends the message, but does not presume
+ that the message body will be saved.
+ Instead, the sender builds a memory descriptor for the message and enables
+ get operations on this descriptor.
+ If the target does not save the body of the message, it will record an
+ event for the put operation.
+ When the process later issues a matching MPI receive, it will perform a
+ get operation to retrieve the body of the message.
+\layout Standard
+
+To facilitate receive side matching based on the protocol, we use the most
+ significant bit in the match bits to indicate the protocol: 1 for long
+ messages and 0 for short messages.
+\layout Standard
+
+The following code presents a function that implements the send side of
+ the protocol.
+ The global variable 
+\family typewriter 
+EndGet
+\family default 
+ is the last match entry attached to the Portal index used for posting long
+ messages.
+ This entry does not match any incoming requests (i.e., the memory descriptor
+ rejects all get operations) and is built during initialization of the MPI
+ library.
+ The other global variable, 
+\family typewriter 
+MPI_NI
+\family default 
+, is a handle for the network interface used by the MPI implementation.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_me_t EndGet;
+\newline 
+extern ptl_handle_ni_t MPI_NI;
+\newline 
+
+\newline 
+void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq,
+\newline 
+                    ptl_process_id_t target, ptl_match_bits_t match ) 
+\newline 
+{
+\newline 
+    ptl_handle_md_t send_handle;
+\newline 
+    ptl_md_t mem_desc;
+\newline 
+    ptl_ack_req_t want_ack;
+\newline 
+
+\newline 
+    mem_desc.start = buf;
+\newline 
+    mem_desc.length = len;
+\newline 
+    mem_desc.threshold = 1;
+\newline 
+    mem_desc.options = PTL_MD_OP_GET;
+\newline 
+    mem_desc.user_ptr = data;
+\newline 
+    mem_desc.eventq = eventq;
+\newline 
+
+\newline 
+    if( len >= MPI_LONG_LENGTH ) {
+\newline 
+        ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+        /* add a match entry to the end of the get list */
+\newline 
+        PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet,
+ &me_handle );
+\newline 
+        PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL );
+\newline 
+
+\newline 
+        /* we want an ack for long messages */
+\newline 
+        want_ack = PTL_ACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a long message
+ */
+\newline 
+        match |= 1<<63;
+\newline 
+    } else {
+\newline 
+        /* we don't want an ack for short messages */
+\newline 
+        want_ack = PTL_NOACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a short message
+ */
+\newline 
+        match &= ~(1<<63);
+\newline 
+    }
+\newline 
+
+\newline 
+   /* create a memory descriptor and send it */
+\newline 
+   PtlMDBind( MPI_NI, mem_desc, &send_handle );
+\newline 
+   PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match,
+ 0 );
+\newline 
+}
+\layout Standard
+
+The 
+\emph on 
+MPIsend
+\emph default 
+ function returns as soon as the message has been scheduled for transmission.
+ The event queue argument, 
+\family typewriter 
+eventq
+\family default 
+, can be used to determine the disposition of the message.
+ Assuming that 
+\family typewriter 
+eventq
+\family default 
+ is not 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, a 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+ event will be recorded for each message as the message is transmitted.
+ For small messages, this is the only event that will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+ In contrast, long messages include an explicit request for an acknowledgement.
+ If the 
+\family typewriter 
+target
+\family default 
+ process has posted a matching receive, the acknowledgement will be sent
+ as the message is received.
+ If a matching receive has not been posted, the message will be discarded
+ and no acknowledgement will be sent.
+ When the 
+\family typewriter 
+target
+\family default 
+ process later issues a matching receive, the receive will be translated
+ into a get operation and a 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+ event will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:mpi}
+
+\end_inset 
+
+ illustrates the organization of the match list used for receiving MPI messages.
+ The initial entries (not shown in this figure) would be used to match the
+ MPI receives that have been preposted by the application.
+ The preposted receives are followed by a match entry, 
+\emph on 
+RcvMark
+\emph default 
+, that marks the boundary between preposted receives and the memory descriptors
+ used for 
+\begin_inset Quotes eld
+\end_inset 
+
+unexpected
+\begin_inset Quotes erd
+\end_inset 
+
+ messages.
+ The 
+\emph on 
+RcvMark
+\emph default 
+ entry is followed by a small collection of match entries that match unexpected
+\begin_inset Quotes eld
+\end_inset 
+
+short
+\begin_inset Quotes erd
+\end_inset 
+
+ messages, i.e., messages that have a 0 in the most significant bit of their
+ match bits.
+ The memory descriptors associated with these match entries will append
+ the incoming message to the associated memory descriptor and record an
+ event in an event queue for unexpected messages.
+ The unexpected short message matching entries are followed by a match entry
+ that will match messages that were not matched by the preceding match entries,
+ i.e., the unexpected long messages.
+ The memory descriptor associated with this match entry truncates the message
+ body and records an event in the event queue for unexpected messages.
+ Note that all of the memory descriptors used for unexpected messages share
+ a common event queue.
+ This makes it possible to process the unexpected messages in the order
+ in which they arrived, regardless of the protocol used to send them.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename mpi.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 389pt
+       lyxheight 284pt
+\end_inset 
+
+
+\layout Caption
+
+Message Reception in MPI
+\begin_inset LatexCommand \label{fig:mpi}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When the local MPI process posts an MPI receive, we must first search the
+ events in the unexpected message queue to see if a matching message has already
+ arrived.
+ If no matching message is found, a match entry for the receive is inserted
+ before the 
+\emph on 
+RcvMark
+\emph default 
+ entry--after the match entries for all of the previously posted receives
+ and before the match entries for the unexpected messages.
+ This ensures that preposted receives are matched in the order that they
+ were posted (a requirement of MPI).
+\layout Standard
+
+While this strategy respects the temporal semantics of MPI, it introduces
+ a race condition: a matching message might arrive after the events in the
+ unexpected message queue have been searched, but before the match entry
+ for the receive has been inserted in the match list.
+\layout Standard
+
+To avoid this race condition we start by setting the 
+\family typewriter 
+threshold
+\family default 
+ of the memory descriptor to 0, making the descriptor inactive.
+ We then insert the match entry into the match list and proceed to search
+ the events in the unexpected message queue.
+ A matching message that arrives as we are searching the unexpected message
+ queue will not be accepted by the memory descriptor and, if not matched
+ by an earlier match list element, will add an event to the unexpected message
+ queue.
+ After searching the events in the unexpected message queue, we update the
+ memory descriptor, setting the threshold to 1 to activate the memory descriptor.
+ This update is predicated on the condition that the unexpected message
+ queue is empty.
+ We repeat the process of searching the unexpected message queue until the
+ update succeeds.
+\layout Standard
+
+The following code fragment illustrates this approach.
+ Because events must be removed from the unexpected message queue to be
+ examined, this code fragment assumes the existence of a user managed event
+ list, 
+\family typewriter 
+Rcvd
+\family default 
+, for the events that have already been removed from the unexpected message
+ queue.
+ In an effort to keep the example focused on the basic protocol, we have
+ omitted the code that would be needed to manage the memory descriptors
+ used for unexpected short messages.
+ In particular, we simply leave messages in these descriptors until they
+ are received by the application.
+ In a robust implementation, we would introduce code to ensure that short
+ unexpected messages are removed from these memory descriptors so that they
+ can be re-used.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_eq_t UnexpQueue;
+\newline 
+extern ptl_handle_me_t RcvMark;
+\newline 
+extern ptl_handle_me_t ShortMatch;
+\newline 
+
+\newline 
+typedef struct event_list_tag {
+\newline 
+    ptl_event_t            event;
+\newline 
+    struct event_list_tag* next;
+\newline 
+} event_list;
+\newline 
+
+\newline 
+extern event_list Rcvd;
+\newline 
+
+\newline 
+void AppendRcvd( ptl_event_t event )
+\newline 
+{
+\newline 
+    /* append an event onto the Rcvd list */
+\newline 
+}
+\newline 
+
+\newline 
+int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi
+ts_t match,
+\newline 
+                       ptl_match_bits_t ignore, ptl_event_t *event )
+\newline 
+{
+\newline 
+    /* Search the Rcvd event queue, looking for a message that matches the
+ requested message.
+\newline 
+     * If one is found, remove the event from the Rcvd list and return it.
+ */
+\newline 
+}
+\newline 
+
+\newline 
+typedef enum { RECEIVED, POSTED } receive_state;
+\newline 
+
+\newline 
+receive_state CopyMsg( void *buf, ptl_size_t *length, ptl_event_t event,
+ ptl_md_t md_buf )
+\newline 
+{
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+    if( event.rlength >= MPI_LONG_LENGTH ) {
+\newline 
+        PtlMDBind( MPI_NI, md_buf, &md_handle );
+\newline 
+        PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX,
+ md_handle );
+\newline 
+        return POSTED;
+\newline 
+    } else {
+\newline 
+        /* copy the message */
+\newline 
+        if( event.mlength < *length ) *length = event.mlength;
+\newline 
+        memcpy( buf, (char*)event.md_desc.start+event.offset, *length );
+\newline 
+        return RECEIVED;
+\newline 
+    }
+\newline 
+}
+\newline 
+
+\newline 
+receive_state MPIreceive( void *buf, ptl_size_t *len, void *MPI_data, ptl_handle
+_eq_t eventq, 
+\newline 
+                           ptl_process_id_t sender, ptl_match_bits_t match,
+ ptl_match_bits_t ignore )
+\newline 
+{
+\newline 
+    ptl_md_t md_buf;
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+    ptl_event_t event;
+\newline 
+
+\newline 
+    /* build a memory descriptor for the receive */
+\newline 
+    md_buf.start = buf;
+\newline 
+    md_buf.length = *len;
+\newline 
+    md_buf.threshold = 0;     /* temporarily disabled */
+\newline 
+    md_buf.options = PTL_MD_OP_PUT;
+\newline 
+    md_buf.user_ptr = MPI_data;
+\newline 
+    md_buf.eventq = eventq;
+\newline 
+
+\newline 
+    /* see if we have already received the message */
+\newline 
+    if( SearchRcvd(buf, len, sender, match, ignore, &event) )
+\newline 
+         return CopyMsg( buf, len, event, md_buf );
+\newline 
+
+\newline 
+    /* create the match entry and attach the  memory descriptor */
+\newline 
+    PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark,
+ &me_handle);
+\newline 
+    PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle );
+\newline 
+
+\newline 
+    md_buf.threshold = 1;
+\newline 
+    do
+\newline 
+        if( PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) {
+\newline 
+            if( MPIMatch(event, match, ignore, sender) ) {
+\newline 
+                return CopyMsg( buf, len, event, md_buf );
+\newline 
+            } else {
+\newline 
+                AppendRcvd( event );
+\newline 
+            }
+\newline 
+        }
+\newline 
+    while( PtlMDUpdate(md_handle, NULL, &md_buf, UnexpQueue) == PTL_NOUPDATE
+ );
+\newline 
+    return POSTED;
+\newline 
+}
+\layout Chapter*
+
+Acknowledgments
+\layout Standard
+
+Several people have contributed to the philosophy, design, and implementation
+ of the Portals message passing architecture as it has evolved.
+ We acknowledge the following people for their contributions: Al Audette,
+ Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike
+ Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke,
+ Dave van Dresser, Lee Ward, and Stephen Wheat.
+\layout Standard
+
+
+\begin_inset LatexCommand \BibTeX[ieee]{portals3}
+
+\end_inset 
+
+
+\the_end
diff --git a/lustre/portals/doc/put.fig b/lustre/portals/doc/put.fig
new file mode 100644 (file)
index 0000000..5235b6d
--- /dev/null
@@ -0,0 +1,32 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1350 900 2175 1200
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1275 2700 1725
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 1200
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2699 1788 899 1938
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001
+4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001
+4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am
new file mode 100644 (file)
index 0000000..2cf7f99
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = portals linux
+EXTRA_DIST = config.h.in
+include $(top_srcdir)/Rules
diff --git a/lustre/portals/include/config.h.in b/lustre/portals/include/config.h.in
new file mode 100644 (file)
index 0000000..b05d0c4
--- /dev/null
@@ -0,0 +1,11 @@
+/* ../include/config.h.in.  Generated automatically from configure.in by autoheader.  */
+
+/* Define if you have the readline library (-lreadline).  */
+#undef HAVE_LIBREADLINE
+
+/* Name of package */
+#undef PACKAGE
+
+/* Version number of package */
+#undef VERSION
+
diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..6a65cb5
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(top_srcdir)/Rules
+
+linuxincludedir = $(includedir)/linux
+
+linuxinclude_HEADERS=kp30.h portals_lib.h
diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h
new file mode 100644 (file)
index 0000000..4915fe3
--- /dev/null
@@ -0,0 +1,936 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _KP30_INCLUDED
+#define _KP30_INCLUDED
+
+
+#define PORTAL_DEBUG
+
+#ifndef offsetof
+# define offsetof(typ,memb)    ((int)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define LOWEST_BIT_SET(x)      ((x) & ~((x) - 1))
+
+#ifndef CONFIG_SMP
+# define smp_processor_id() 0
+#endif
+
+/*
+ *  Debugging
+ */
+extern unsigned int portal_subsystem_debug;
+extern unsigned int portal_stack;
+extern unsigned int portal_debug;
+extern unsigned int portal_printk;
+/* Debugging subsystems  (8 bit ID)
+ *
+ * If you add debug subsystem #32, you need to send email to phil, because
+ * you're going to break kernel subsystem debug filtering. */
+#define S_UNDEFINED    (0 << 24)
+#define S_MDC          (1 << 24)
+#define S_MDS          (2 << 24)
+#define S_OSC          (3 << 24)
+#define S_OST          (4 << 24)
+#define S_CLASS        (5 << 24)
+#define S_OBDFS        (6 << 24) /* obsolete */
+#define S_LLITE        (7 << 24)
+#define S_RPC          (8 << 24)
+#define S_EXT2OBD      (9 << 24) /* obsolete */
+#define S_PORTALS     (10 << 24)
+#define S_SOCKNAL     (11 << 24)
+#define S_QSWNAL      (12 << 24)
+#define S_PINGER      (13 << 24)
+#define S_FILTER      (14 << 24)
+#define S_TRACE       (15 << 24) /* obsolete */
+#define S_ECHO        (16 << 24)
+#define S_LDLM        (17 << 24)
+#define S_LOV         (18 << 24)
+#define S_GMNAL       (19 << 24)
+#define S_PTLROUTER   (20 << 24)
+#define S_COBD        (21 << 24)
+#define S_PTLBD       (22 << 24)
+#define S_LOG         (23 << 24)
+
+/* If you change these values, please keep portals/linux/utils/debug.c
+ * up to date! */
+
+/* Debugging masks (24 bits, non-overlapping) */
+#define D_TRACE     (1 << 0) /* ENTRY/EXIT markers */
+#define D_INODE     (1 << 1)
+#define D_SUPER     (1 << 2)
+#define D_EXT2      (1 << 3) /* anything from ext2_debug */
+#define D_MALLOC    (1 << 4) /* print malloc, free information */
+#define D_CACHE     (1 << 5) /* cache-related items */
+#define D_INFO      (1 << 6) /* general information */
+#define D_IOCTL     (1 << 7) /* ioctl related information */
+#define D_BLOCKS    (1 << 8) /* ext2 block allocation */
+#define D_NET       (1 << 9) /* network communications */
+#define D_WARNING   (1 << 10)
+#define D_BUFFS     (1 << 11)
+#define D_OTHER     (1 << 12)
+#define D_DENTRY    (1 << 13)
+#define D_PORTALS   (1 << 14) /* ENTRY/EXIT markers */
+#define D_PAGE      (1 << 15) /* bulk page handling */
+#define D_DLMTRACE  (1 << 16)
+#define D_ERROR     (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG     (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA        (1 << 19) /* recovery and failover */
+#define D_RPCTRACE  (1 << 20) /* for distributed debugging */
+#define D_VFSTRACE  (1 << 21)
+
+#ifndef THREAD_SIZE
+#define THREAD_SIZE 8192
+#endif
+/* CDEBUG_STACK(var): value that CDEBUG() stores in its 'stack' local and
+ * hands to CHECK_STACK() and portals_debug_msg(). */
+#ifdef  __arch_ia64__
+/* The address has to be converted to an integer before it is masked:
+ * binary '&' is not defined for pointer operands. */
+#define CDEBUG_STACK(var) ((unsigned long)&(var) & (THREAD_SIZE - 1))
+#else
+#define CDEBUG_STACK(var) (THREAD_SIZE -                                      \
+                           ((unsigned long)__builtin_frame_address(0)&        \
+                            (THREAD_SIZE - 1)))
+#endif
+
+#ifdef __KERNEL__
+#define CHECK_STACK(stack)                                                    \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack)      \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR,           \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          (stack),                            \
+                                          "maximum lustre stack %u\n",        \
+                                          portal_stack = (stack));            \
+        } while (0)
+#else
+#define CHECK_STACK(stack) do{}while(0)
+#endif
+
+#define CDEBUG(mask, format, a...)                                            \
+do {                                                                          \
+        unsigned long stack = CDEBUG_STACK(stack);                            \
+        int match = 0;                                                        \
+                                                                              \
+        CHECK_STACK(stack);                                                   \
+        if (!(mask))                                                          \
+                match = 1;                                                    \
+        else if ((mask) & (D_ERROR | D_EMERG))                                \
+                match = 1;                                                    \
+        else if (portal_debug & (mask) &&                                     \
+                 portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24)))     \
+                match = 1;                                                    \
+        if (match)                                                            \
+                portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  stack, format , ## a);                      \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
+
+#define GOTO(label, rc)                                                 \
+do {                                                                    \
+        long GOTO__ret = (long)(rc);                                    \
+        CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \
+               #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\
+               (signed long)GOTO__ret);                                 \
+        goto label;                                                     \
+} while (0)
+
+#define RETURN(rc)                                                      \
+do {                                                                    \
+        typeof(rc) RETURN__ret = (rc);                                  \
+        long tmp = (long)RETURN__ret;                                   \
+        CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n",       \
+               (unsigned long)tmp, (signed long)tmp,                    \
+               (signed long)tmp);                                       \
+        return RETURN__ret;                                             \
+} while (0)
+
+#define ENTRY                                                           \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process entered\n");                           \
+} while (0)
+
+#define EXIT                                                            \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process leaving\n");                           \
+} while(0)
+
+
+#ifdef __KERNEL__
+# include <linux/vmalloc.h>
+# include <linux/time.h>
+# include <linux/slab.h>
+# include <linux/interrupt.h>
+# include <linux/highmem.h>
+# include <linux/module.h>
+# include <linux/version.h>
+# include <portals/lib-nal.h>
+# include <linux/smp_lock.h>
+# include <asm/atomic.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define schedule_work schedule_task
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_TQUEUE((wq), 0, 0);                                              \
+        PREPARE_TQUEUE((wq), (cb), (cbdata));                                 \
+} while (0)
+
+#define ll_invalidate_inode_pages invalidate_inode_pages
+#define PageUptodate Page_Uptodate
+#define our_recalc_sigpending(current) recalc_sigpending(current)
+#define num_online_cpus() smp_num_cpus
+static inline void our_cond_resched(void)
+{
+        if (current->need_resched)
+               schedule ();
+}
+
+#else
+
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_WORK((wq), (void *)(cb), (void *)(cbdata));                      \
+} while (0)
+#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping)
+#define wait_on_page wait_on_page_locked
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+static inline void our_cond_resched(void)
+{
+        cond_resched();
+}
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+
+#ifdef PORTAL_DEBUG
+extern void kportal_assertion_failed(char *expr,char *file,char *func,int line);
+/* LASSERT(e): call kportal_assertion_failed() with the stringified
+ * expression and the source location when 'e' is false.  Both arms of the
+ * conditional are void (the failure handler returns void), so LASSERT()
+ * yields no value and must be used as a statement. */
+#define LASSERT(e) ((e) ? (void)0 :                                     \
+                    kportal_assertion_failed( #e , __FILE__,            \
+                                              __FUNCTION__, __LINE__))
+#else
+/* Assertions compile away entirely when PORTAL_DEBUG is not defined. */
+#define LASSERT(e)
+#endif
+
+#ifdef __arch_um__
+#define LBUG()                                                          \
+do {                                                                    \
+        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__);      \
+        panic("LBUG");                                                  \
+} while (0)
+#else
+#define LBUG()                                                          \
+do {                                                                    \
+        CEMERG("LBUG\n");                                               \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(__FILE__, __FUNCTION__, __LINE__);      \
+        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
+        schedule();                                                     \
+} while (0)
+#endif /* __arch_um__ */
+
+/*
+ * Memory
+ */
+#ifdef PORTAL_DEBUG
+extern atomic_t portal_kmemory;
+
+# define portal_kmem_inc(ptr, size)                                           \
+do {                                                                          \
+        atomic_add(size, &portal_kmemory);                                    \
+} while (0)
+
+# define portal_kmem_dec(ptr, size) do {                                      \
+        atomic_sub(size, &portal_kmemory);                                    \
+} while (0)
+
+#else
+# define portal_kmem_inc(ptr, size) do {} while (0)
+# define portal_kmem_dec(ptr, size) do {} while (0)
+#endif /* PORTAL_DEBUG */
+
+#define PORTAL_VMALLOC_SIZE        16384
+
+#define PORTAL_ALLOC(ptr, size)                                           \
+do {                                                                      \
+        long s = size;                                                    \
+        LASSERT (!in_interrupt());                                        \
+        if (s > PORTAL_VMALLOC_SIZE)                                      \
+                (ptr) = vmalloc(s);                                       \
+        else                                                              \
+                (ptr) = kmalloc(s, GFP_KERNEL);                           \
+        if ((ptr) == NULL)                                                \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s);    \
+        else {                                                            \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_FREE(ptr, size)                                          \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        if (s > PORTAL_VMALLOC_SIZE)                                    \
+                vfree(ptr);                                             \
+        else                                                            \
+                kfree(ptr);                                             \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+#define PORTAL_SLAB_ALLOC(ptr, slab, size)                                \
+do {                                                                      \
+        long s = (size);                                                  \
+        LASSERT (!in_interrupt());                                        \
+        (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL);                    \
+        if ((ptr) == NULL) {                                              \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' from slab '" #slab "')\n", __FILE__,  \
+                       __LINE__);                                         \
+        } else {                                                          \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_SLAB_FREE(ptr, slab, size)                               \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        memset((ptr), 0x5a, s);                                         \
+        kmem_cache_free((slab), ptr);                                   \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+/* ------------------------------------------------------------------- */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x)
+
+#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x))
+#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
+
+#define PORTAL_MODULE_USE       MOD_INC_USE_COUNT
+#define PORTAL_MODULE_UNUSE     MOD_DEC_USE_COUNT
+#else
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+#define PORTAL_SYMBOL_GET(x) symbol_get(x)
+#define PORTAL_SYMBOL_PUT(x) symbol_put(x)
+
+#define PORTAL_MODULE_USE       try_module_get(THIS_MODULE)
+#define PORTAL_MODULE_UNUSE     module_put(THIS_MODULE)
+
+#endif
+
+/******************************************************************************/
+/* Kernel Portals Router interface */
+
+typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback
+
+/* space for routing targets to stash "stuff" in a forwarded packet */
+typedef union {
+        long long        _alignment;
+        void            *_space[16];            /* scale with CPU arch */
+} kprfd_scratch_t;
+
+/* Kernel Portals Routing Forwarded message Descriptor */
+typedef struct {
+        struct list_head     kprfd_list;        /* stash in queues (routing target can use) */
+        ptl_nid_t            kprfd_target_nid;  /* final destination NID */
+        ptl_nid_t            kprfd_gateway_nid; /* gateway NID */
+        int                  kprfd_nob;         /* # message bytes (including header) */
+        int                  kprfd_niov;        /* # message frags (including header) */
+        struct iovec        *kprfd_iov;         /* message fragments */
+        void                *kprfd_router_arg;  // originating NAL's router arg
+        kpr_fwd_callback_t   kprfd_callback;    /* completion callback */
+        void                *kprfd_callback_arg; /* completion callback arg */
+        kprfd_scratch_t      kprfd_scratch;    // scratchpad for routing targets
+} kpr_fwd_desc_t;
+
+typedef void  (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+
+/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
+typedef const struct {
+        int             kprni_nalid;    /* NAL's id */
+        void           *kprni_arg;      /* Arg to pass when calling into NAL */
+        kpr_fwd_t       kprni_fwd;      /* NAL's forwarding entrypoint */
+} kpr_nal_interface_t;
+
+/* Router's routing interface (Kernel Portals Routing Router Interface) */
+typedef const struct {
+        /* register the calling NAL with the router and get back the handle for
+         * subsequent calls */
+        int     (*kprri_register) (kpr_nal_interface_t *nal_interface,
+                                   void **router_arg);
+
+        /* ask the router to find a gateway that forwards to 'nid' and is a peer
+         * of the calling NAL */
+        int     (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+                                 ptl_nid_t *gateway_nid);
+
+        /* hand a packet over to the router for forwarding */
+        kpr_fwd_t kprri_fwd_start;
+
+        /* hand a packet back to the router for completion */
+        void    (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
+                                   int error);
+
+        /* the calling NAL is shutting down */
+        void    (*kprri_shutdown) (void *router_arg);
+
+        /* deregister the calling NAL with the router */
+        void    (*kprri_deregister) (void *router_arg);
+
+} kpr_router_interface_t;
+
+/* Convenient struct for NAL to stash router interface/args */
+typedef struct {
+        kpr_router_interface_t  *kpr_interface;
+        void                    *kpr_arg;
+} kpr_router_t;
+
+/* Router's control interface (Kernel Portals Routing Control Interface) */
+typedef const struct {
+        int     (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
+                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+        int     (*kprci_del_route)(ptl_nid_t nid);
+        int     (*kprci_get_route)(int index, int *gateway_nal,
+                                   ptl_nid_t *gateway, ptl_nid_t *lo_nid,
+                                   ptl_nid_t *hi_nid);
+} kpr_control_interface_t;
+
+extern kpr_control_interface_t  kpr_control_interface;
+extern kpr_router_interface_t   kpr_router_interface;
+
+static inline int
+kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif)
+{
+        int    rc;
+
+        router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface);
+        if (router->kpr_interface == NULL)
+                return (-ENOENT);
+
+        rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg);
+        if (rc != 0)
+                router->kpr_interface = NULL;
+
+        PORTAL_SYMBOL_PUT (kpr_router_interface);
+        return (rc);
+}
+
+static inline int
+kpr_routing (kpr_router_t *router)
+{
+        return (router->kpr_interface != NULL);
+}
+
+/* Ask the router for a gateway that forwards to 'nid'; the result is
+ * stored through 'gateway_nid' by the router's kprri_lookup method.
+ * Returns -EHOSTUNREACH when no router interface is registered, otherwise
+ * whatever kprri_lookup returns. */
+static inline int
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+{
+        kpr_router_interface_t *rif;
+
+        if (kpr_routing (router) == 0)
+                return (-EHOSTUNREACH);
+
+        rif = router->kpr_interface;
+        return (rif->kprri_lookup (router->kpr_arg, nid, gateway_nid));
+}
+
+static inline void
+kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, 
+              int nob, int niov, struct iovec *iov, 
+              kpr_fwd_callback_t callback, void *callback_arg)
+{
+        fwd->kprfd_target_nid   = nid;
+        fwd->kprfd_gateway_nid  = nid;
+        fwd->kprfd_nob          = nob;
+        fwd->kprfd_niov         = niov;
+        fwd->kprfd_iov          = iov;
+        fwd->kprfd_callback     = callback;
+        fwd->kprfd_callback_arg = callback_arg;
+}
+
+/* Hand 'fwd' to the router for forwarding.  If no router interface is
+ * registered, the descriptor's completion callback is invoked directly
+ * with -EHOSTUNREACH instead. */
+static inline void
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
+{
+        if (kpr_routing (router)) {
+                router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
+                return;
+        }
+
+        /* not routing: complete the descriptor with an error right away */
+        fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+}
+
+static inline void
+kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error)
+{
+        LASSERT (kpr_routing (router));
+        router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error);
+}
+
+/* Invoke the router's kprri_shutdown method for this NAL; does nothing
+ * when no router interface is registered. */
+static inline void
+kpr_shutdown (kpr_router_t *router)
+{
+        if (!kpr_routing (router))
+                return;
+
+        router->kpr_interface->kprri_shutdown (router->kpr_arg);
+}
+
+/* Invoke the router's kprri_deregister method and then clear the stashed
+ * interface pointer, so that kpr_routing() reports false afterwards.
+ * Does nothing when no router interface is registered. */
+static inline void
+kpr_deregister (kpr_router_t *router)
+{
+        if (kpr_routing (router)) {
+                router->kpr_interface->kprri_deregister (router->kpr_arg);
+                router->kpr_interface = NULL;
+        }
+}
+
+/******************************************************************************/
+
+#ifdef PORTALS_PROFILING
+#define prof_enum(FOO) PROF__##FOO
+enum {
+        prof_enum(our_recvmsg),
+        prof_enum(our_sendmsg),
+        prof_enum(socknal_recv),
+        prof_enum(lib_parse),
+        prof_enum(conn_list_walk),
+        prof_enum(memcpy),
+        prof_enum(lib_finalize),
+        prof_enum(pingcli_time),
+        prof_enum(gmnal_send),
+        prof_enum(gmnal_recv),
+        MAX_PROFS
+};
+
+struct prof_ent {
+        char *str;
+        /* hrmph.  wrap-tastic. */
+        u32       starts;
+        u32       finishes;
+        cycles_t  total_cycles;
+        cycles_t  start;
+        cycles_t  end;
+};
+
+extern struct prof_ent prof_ents[MAX_PROFS];
+
+#define PROF_START(FOO)                                         \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->starts++;                                   \
+                pe->start = get_cycles();                       \
+        } while (0)
+
+#define PROF_FINISH(FOO)                                        \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->finishes++;                                 \
+                pe->end = get_cycles();                         \
+                pe->total_cycles += (pe->end - pe->start);      \
+        } while (0)
+#else /* !PORTALS_PROFILING */
+#define PROF_START(FOO) do {} while(0)
+#define PROF_FINISH(FOO) do {} while(0)
+#endif /* PORTALS_PROFILING */
+
+/* debug.c */
+void portals_run_lbug_upcall(char * file, char *fn, int line);
+void portals_debug_dumplog(void);
+int portals_debug_init(unsigned long bufsize);
+int portals_debug_cleanup(void);
+int portals_debug_clear_buffer(void);
+int portals_debug_mark_buffer(char *text);
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                char *file, unsigned int size);
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len);
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                        unsigned long stack, const char *format, ...)
+        __attribute__ ((format (printf, 7, 8)));
+#else
+void portals_debug_msg (int subsys, int mask, char *file, char *fn,
+                        int line, unsigned long stack,
+                        const char *format, ...);
+#endif /* __GNUC__ */
+void portals_debug_set_level(unsigned int debug_level);
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+extern void kportal_daemonize (char *name);
+extern void kportal_blockallsigs (void);
+
+#else  /* !__KERNEL__ */
+# include <stdio.h>
+# include <stdlib.h>
+#ifndef __CYGWIN__
+# include <stdint.h>
+#endif
+# include <unistd.h>
+# include <time.h>
+# include <asm/types.h>
+# ifndef DEBUG_SUBSYSTEM
+#  define DEBUG_SUBSYSTEM S_UNDEFINED
+# endif
+# ifdef PORTAL_DEBUG
+#  undef NDEBUG
+#  include <assert.h>
+#  define LASSERT(e)   assert(e)
+# else
+#  define LASSERT(e)
+# endif
+# define printk(format, args...) printf (format, ## args)
+# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0);
+# define PORTAL_FREE(a, b) do { free(a); } while (0);
+# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
+    printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
+            (subsys) >> 24, (mask), (long)time(0), file, fn, line,            \
+            getpid() , stack, ## a);
+#endif
+
+#ifndef CURRENT_TIME
+# define CURRENT_TIME time(0)
+#endif
+
+#include <linux/portals_lib.h>
+
+/*
+ * USER LEVEL STUFF BELOW
+ */
+
+#define PORTAL_IOCTL_VERSION 0x00010007
+#define PING_SYNC       0
+#define PING_ASYNC      1
+
+struct portal_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+        __u64 ioc_nid;
+        __u64 ioc_nid2;
+        __u64 ioc_nid3;
+        __u32 ioc_count;
+        __u32 ioc_nal;
+        __u32 ioc_nal_cmd;
+        __u32 ioc_fd;
+        __u32 ioc_id;
+
+        __u32 ioc_flags;
+        __u32 ioc_size;
+
+        __u32 ioc_wait;
+        __u32 ioc_timeout;
+        __u32 ioc_misc;
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
+
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+struct portal_ioctl_hdr {
+        __u32 ioc_len;
+        __u32 ioc_version;
+};
+
+struct portals_debug_ioctl_data
+{
+        struct portal_ioctl_hdr hdr;
+        unsigned int subs;
+        unsigned int debug;
+};
+
+#define PORTAL_IOC_INIT(data)                           \
+do {                                                    \
+        memset(&data, 0, sizeof(data));                 \
+        data.ioc_version = PORTAL_IOCTL_VERSION;        \
+        data.ioc_len = sizeof(data);                    \
+} while (0)
+
+/* FIXME check conflict with lustre_lib.h */
+#define PTL_IOC_DEBUG_MASK             _IOWR('f', 250, long)
+
+/* Length of a packed ioctl request: the fixed portal_ioctl_data struct
+ * followed by both inline buffers, each length passed through
+ * size_round(). */
+static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
+{
+        int inl1 = size_round(data->ioc_inllen1);
+        int inl2 = size_round(data->ioc_inllen2);
+
+        return (int)sizeof(*data) + inl1 + inl2;
+}
+
+static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data)
+{
+        if (data->ioc_len > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+                CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+                CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf1 && !data->ioc_plen1) {
+                CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf2 && !data->ioc_plen2) {
+                CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_plen1 && !data->ioc_pbuf1) {
+                CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+                return 1;
+        }
+        if (data->ioc_plen2 && !data->ioc_pbuf2) {
+                CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+                return 1;
+        }
+        if (portal_ioctl_packlen(data) != data->ioc_len ) {
+                CERROR ("PORTALS ioctl: packlen != ioc_len\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 &&
+            data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 &&
+            data->ioc_bulk[size_round(data->ioc_inllen1) +
+                           data->ioc_inllen2 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n");
+                return 1;
+        }
+        return 0;
+}
+
+#ifndef __KERNEL__
+static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
+                                    int max)
+{
+        char *ptr;
+        struct portal_ioctl_data *overlay;
+        data->ioc_len = portal_ioctl_packlen(data);
+        data->ioc_version = PORTAL_IOCTL_VERSION;
+
+        if (*pbuf && portal_ioctl_packlen(data) > max)
+                return 1;
+        if (*pbuf == NULL) {
+                *pbuf = malloc(data->ioc_len);
+        }
+        if (!*pbuf)
+                return 1;
+        overlay = (struct portal_ioctl_data *)*pbuf;
+        memcpy(*pbuf, data, sizeof(*data));
+
+        ptr = overlay->ioc_bulk;
+        if (data->ioc_inlbuf1)
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        if (data->ioc_inlbuf2)
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        if (portal_ioctl_is_invalid(overlay))
+                return 1;
+
+        return 0;
+}
+#else
+#include <asm/uaccess.h>
+
+/* Copy an ioctl request from user space ('arg') into the kernel buffer
+ * [buf, end).
+ *
+ * The header is fetched first to learn the version and total length; the
+ * whole request is then copied and checked with portal_ioctl_is_invalid().
+ * On success ioc_inlbuf1/ioc_inlbuf2 are pointed at the inline data that
+ * follows the struct inside 'buf'.
+ *
+ * buffer MUST be at least the size of portal_ioctl_hdr
+ *
+ * Returns 0 on success, -EFAULT if the user memory cannot be read, and
+ * -EINVAL if the request is malformed or does not fit in the buffer. */
+static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct portal_ioctl_hdr *hdr;
+        struct portal_ioctl_data *data;
+        __u32 len;
+        ENTRY;
+
+        hdr = (struct portal_ioctl_hdr *)buf;
+        data = (struct portal_ioctl_data *)buf;
+
+        /* copy_from_user() returns the number of bytes it could NOT copy,
+         * not an errno, so any failure is reported as -EFAULT */
+        if (copy_from_user(buf, (void *)arg, sizeof(*hdr))) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+                CERROR ("PORTALS: version mismatch kernel vs application\n");
+                EXIT;
+                return -EINVAL;
+        }
+
+        /* keep the checked length: 'hdr' aliases 'buf', which the second
+         * copy below overwrites */
+        len = hdr->ioc_len;
+
+        /* compare against the space available rather than forming
+         * 'buf + len', which can wrap for a huge user-supplied length */
+        if (end <= buf || len >= (unsigned long)(end - buf)) {
+                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
+                EXIT;
+                return -EINVAL;
+        }
+
+        if (len < sizeof(struct portal_ioctl_data)) {
+                CERROR ("PORTALS: user buffer too small for ioctl\n");
+                EXIT;
+                return -EINVAL;
+        }
+
+        if (copy_from_user(buf, (void *)arg, len)) {
+                EXIT;
+                return -EFAULT;
+        }
+
+        /* user space may have rewritten the header between the two copies;
+         * refuse a length that no longer matches the one validated above,
+         * since portal_ioctl_is_invalid() indexes ioc_bulk based on it */
+        if (data->ioc_len != len) {
+                CERROR ("PORTALS: ioctl length changed during copy\n");
+                EXIT;
+                return -EINVAL;
+        }
+
+        if (portal_ioctl_is_invalid(data)) {
+                CERROR ("PORTALS: ioctl not correctly formatted\n");
+                EXIT;
+                return -EINVAL;
+        }
+
+        /* inline buffer 1 starts at ioc_bulk; buffer 2 follows it at the
+         * rounded-up length of buffer 1 */
+        if (data->ioc_inllen1) {
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+        }
+
+        if (data->ioc_inllen2) {
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+        }
+
+        EXIT;
+        return 0;
+}
+#endif
+
+/* portals ioctls: type 'e', command numbers 30-41 */
+#define IOC_PORTAL_TYPE                   'e'
+#define IOC_PORTAL_MIN_NR                 30
+
+#define IOC_PORTAL_PING                    _IOWR('e', 30, long)
+#define IOC_PORTAL_GET_DEBUG               _IOWR('e', 31, long)
+#define IOC_PORTAL_CLEAR_DEBUG             _IOWR('e', 32, long)
+#define IOC_PORTAL_MARK_DEBUG              _IOWR('e', 33, long)
+#define IOC_PORTAL_PANIC                   _IOWR('e', 34, long)
+#define IOC_PORTAL_ADD_ROUTE               _IOWR('e', 35, long)
+#define IOC_PORTAL_DEL_ROUTE               _IOWR('e', 36, long)
+#define IOC_PORTAL_GET_ROUTE               _IOWR('e', 37, long)
+#define IOC_PORTAL_NAL_CMD                _IOWR('e', 38, long)
+#define IOC_PORTAL_GET_NID                 _IOWR('e', 39, long)
+#define IOC_PORTAL_FAIL_NID                _IOWR('e', 40, long)
+#define IOC_PORTAL_SET_DAEMON              _IOWR('e', 41, long)
+
+#define IOC_PORTAL_MAX_NR               41
+
+enum {
+        QSWNAL  =  1,
+        SOCKNAL,
+        GMNAL,
+        TOENAL,
+        TCPNAL,
+        SCIMACNAL,
+        NAL_ENUM_END_MARKER
+};
+
+#ifdef __KERNEL__
+extern ptl_handle_ni_t  kqswnal_ni;
+extern ptl_handle_ni_t  ksocknal_ni;
+extern ptl_handle_ni_t  ktoenal_ni;
+extern ptl_handle_ni_t  kgmnal_ni;
+extern ptl_handle_ni_t  kscimacnal_ni;
+#endif
+
+#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
+
+#define NAL_CMD_REGISTER_PEER_FD     100
+#define NAL_CMD_CLOSE_CONNECTION     101
+#define NAL_CMD_REGISTER_MYNID       102
+#define NAL_CMD_PUSH_CONNECTION      103
+
+enum {
+        DEBUG_DAEMON_START       =  1,
+        DEBUG_DAEMON_STOP        =  2,
+        DEBUG_DAEMON_PAUSE       =  3,
+        DEBUG_DAEMON_CONTINUE    =  4,
+};
+
+/* XXX remove to lustre ASAP */
+struct lustre_peer {
+        ptl_nid_t       peer_nid;
+        ptl_handle_ni_t peer_ni;
+};
+
+/* module.c */
+typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private);
+int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
+int kportal_nal_unregister(int nal);
+
+ptl_handle_ni_t *kportal_get_ni (int nal);
+void kportal_put_ni (int nal);
+
+#ifdef __CYGWIN__
+#ifndef BITS_PER_LONG
+#if (~0UL) == 0xffffffffUL
+#define BITS_PER_LONG 32
+#else 
+#define BITS_PER_LONG 64
+#endif
+#endif
+#endif
+
+#if (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%Lu"
+# define LPD64 "%Ld"
+# define LPX64 "%#Lx"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#if (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%lu"
+# define LPD64 "%ld"
+# define LPX64 "%#lx"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
+
+#endif
diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h
new file mode 100644 (file)
index 0000000..a528a80
--- /dev/null
@@ -0,0 +1,188 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _PORTALS_LIB_H
+#define _PORTALS_LIB_H
+
+#ifndef __KERNEL__
+# include <string.h>
+#else 
+# include <asm/types.h>
+#endif
+
+/* Smaller/larger of two values.  Each argument is evaluated twice, so do
+ * not pass expressions with side effects. */
+#undef MIN
+#define MIN(a,b) (((a)<(b)) ? (a): (b))
+#undef MAX
+#define MAX(a,b) (((a)>(b)) ? (a): (b))
+/* Map a NULL string pointer to "".  The whole expansion is parenthesized
+ * so it keeps its meaning inside larger expressions. */
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")
+
+/* Round val up to the next multiple of 8; values that are already a
+ * multiple of 8 are returned unchanged. */
+static inline int size_round (int val)
+{
+        const int mask = 0x7;
+
+        return (val + mask) & ~mask;
+}
+
+/* Round val + 1 up to the next multiple of 8 (one extra byte is added
+ * before rounding); a zero val stays zero. */
+static inline int size_round0(int val)
+{
+        return val ? ((val + 8) & ~0x7) : 0;
+}
+
+/* Length of fset including its NUL terminator, rounded up to a multiple
+ * of 8 by size_round().  fset is only read, hence const; existing callers
+ * passing a plain char * are unaffected. */
+static inline size_t round_strlen(const char *fset)
+{
+        return size_round(strlen(fset) + 1);
+}
+
+#ifdef __KERNEL__
+/* Kernel-side strdup(): returns a kmalloc(GFP_KERNEL) copy of str
+ * (terminator included), or NULL if the allocation fails.  The length is
+ * kept in a size_t so it matches what strlen() returns and is not
+ * truncated through int. */
+static inline char *strdup(const char *str)
+{
+        size_t len = strlen(str) + 1;
+        char *tmp = kmalloc(len, GFP_KERNEL);
+
+        if (tmp)
+                memcpy(tmp, str, len);
+
+        return tmp;
+}
+#endif
+
+#ifdef __KERNEL__
+# define NTOH__u32(var) le32_to_cpu(var)
+# define NTOH__u64(var) le64_to_cpu(var)
+# define HTON__u32(var) cpu_to_le32(var)
+# define HTON__u64(var) cpu_to_le64(var)
+#else
+/* Widen var to __u64 according to sizeof(var); 4/2/1-byte operands go
+ * through the same-width __u32/__u16/__u8 type first, so they are not
+ * sign-extended.  The temporary starts at 0 so the result is defined for
+ * any other operand size, and it has a private name so that an argument
+ * that itself mentions a variable called "ret" is not captured. */
+# define expansion_u64(var) \
+    ({  __u64 _exp_u64_ret = 0; \
+       switch (sizeof(var)) {   \
+       case 8: _exp_u64_ret = (var); break; \
+       case 4: _exp_u64_ret = (__u32)(var); break; \
+       case 2: _exp_u64_ret = (__u16)(var); break; \
+       case 1: _exp_u64_ret = (__u8)(var); break; \
+       default: break; \
+       }       \
+       _exp_u64_ret;     \
+    })
+# define NTOH__u32(var) (var)
+# define NTOH__u64(var) (expansion_u64(var))
+# define HTON__u32(var) (var)
+# define HTON__u64(var) (expansion_u64(var))
+#endif
+
+/*
+ * Read a value of the given type from ptr into var and move ptr forward
+ * by sizeof(type).  Makes the *enclosing function* return -EFAULT if the
+ * advanced ptr goes beyond end.  The bounds check is done before the
+ * value is read, so nothing past end is dereferenced.
+ * ptr is expanded several times and must be a plain lvalue.
+ */
+#define UNLOGV(var,type,ptr,end)                        \
+do {                                                    \
+        (ptr) += sizeof(type);                          \
+        if ((ptr) > (end))                              \
+                return -EFAULT;                         \
+        (var) = *(type *)((ptr) - sizeof(type));        \
+} while (0)
+
+/*
+ * LUNLOGV and LLOGV convert between CPU and stored byte order (the kernel
+ * build maps NTOH/HTON onto the little-endian helpers).
+ * Read a value of the given type from ptr, pass it through NTOH<type>(),
+ * store it in var and move ptr forward by sizeof(type).  Makes the
+ * enclosing function return -EFAULT if the advanced ptr goes beyond end;
+ * the check is done before the value is read.
+ * type MUST be __u32 or __u64 (it is pasted onto NTOH to pick the macro).
+ */
+#define LUNLOGV(var,type,ptr,end)                               \
+do {                                                            \
+        (ptr) += sizeof(type);                                  \
+        if ((ptr) > (end))                                      \
+                return -EFAULT;                                 \
+        (var) = NTOH##type(*(type *)((ptr) - sizeof(type)));    \
+} while (0)
+
+/* now log values */
+#define LOGV(var,type,ptr)                      \
+do {                                            \
+        *((type *)ptr) = var;                   \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* and in network order */
+#define LLOGV(var,type,ptr)                     \
+do {                                            \
+        *((type *)ptr) = HTON##type(var);       \
+        ptr += sizeof(type);                    \
+} while (0)
+
+
+/* 
+ * set var to point at (type *)ptr, move ptr forward with sizeof(type)
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGP(var,type,ptr,end)                \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+#define LOGP(var,type,ptr)                      \
+do {                                            \
+        memcpy(ptr, var, sizeof(type));         \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* 
+ * set var to point at (char *)ptr, move ptr forward by size_round(len);
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGL(var,type,len,ptr,end)            \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += size_round(len * sizeof(type));  \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+/*
+ * UNLOGL plus a terminator check: makes the enclosing function return
+ * -EFAULT unless the byte at offset len - 1 from the start of the
+ * unpacked region is NUL.  The check is anchored at var (the start of the
+ * region) instead of being recomputed from the advanced ptr, so it does
+ * not depend on how far UNLOGL moved ptr.
+ * NOTE(review): len == 0 inspects the byte just before the region;
+ * callers presumably never pass 0 -- confirm.
+ */
+#define UNLOGL0(var,type,len,ptr,end)                                   \
+do {                                                                    \
+        UNLOGL(var,type,len,ptr,end);                                   \
+        if (*((const char *)(var) + (len) - 1) != '\0')                 \
+                return -EFAULT;                                         \
+} while (0)
+
+#define LOGL(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)ptr, (const char *)var, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGU(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)var, (const char *)ptr, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {                                                    \
+        if (!len)                                       \
+                break;                                  \
+        memcpy((char *)ptr, (const char *)var, len);    \
+        *((char *)(ptr) + len) = 0;                     \
+        ptr += size_round(len + 1);                     \
+} while (0)
+
+#endif /* _PORTALS_LIB_H */
diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am
new file mode 100644 (file)
index 0000000..c61b084
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = base
+include $(top_srcdir)/Rules
+
+pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h
+
diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h
new file mode 100644 (file)
index 0000000..af4a2dc
--- /dev/null
@@ -0,0 +1,27 @@
+# define DEBUG_SUBSYSTEM S_PORTALS
+# define PORTAL_DEBUG
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+
+#include <portals/internal.h>
+#include <portals/nal.h>
+#include <portals/arg-blocks.h>
+
+/* Hack for 2.4.18 macro name collision */
+#ifdef yield
+#undef yield
+#endif
diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h
new file mode 100644 (file)
index 0000000..a83749b
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef P30_API_H
+#define P30_API_H
+
+#include <portals/types.h>
+
+#ifndef PTL_NO_WRAP
+int PtlInit(void);
+int PtlInitialized(void);
+void PtlFini(void);
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in,
+              ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * interface_out);
+
+int PtlNIInitialized(ptl_interface_t);
+
+int PtlNIFini(ptl_handle_ni_t interface_in);
+
+#endif
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
+
+
+/*
+ * Network interfaces
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlNIBarrier(ptl_handle_ni_t interface_in);
+#endif
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out);
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out);
+
+#ifndef PTL_NO_WRAP
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
+#endif
+
+
+/*
+ * PtlNIDebug: 
+ *
+ * This is not an official Portals 3 API call.  It is provided
+ * by the reference implementation to allow the maintainers an
+ * easy way to turn on and off debugging information in the
+ * library.  Do not use it in code that is not intended for use
+ * with any version other than the portable reference library.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
+
+/* 
+ * PtlFailNid
+ *
+ * Not an official Portals 3 API call.  It provides a way of simulating
+ * communications failures to all (nid == PTL_NID_ANY), or specific peers
+ * (via multiple calls), either until further notice (threshold == -1), or
+ * for a specific number of messages.  Passing a threshold of zero "heals"
+ * the given peer.
+ */
+int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
+
+
+/*
+ * Match entries
+ */
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out);
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out);
+
+int PtlMEUnlink(ptl_handle_me_t current_in);
+
+int PtlMEUnlinkList(ptl_handle_me_t current_in);
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in);
+int PtlMEDump(ptl_handle_me_t current_in);
+
+
+
+/*
+ * Memory descriptors
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_handle_md_t * handle_out);
+
+int PtlMDUnlink(ptl_handle_md_t md_in);
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
+
+#endif
+
+/* These should not be called by users */
+int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                         ptl_md_t * new_inout, ptl_handle_eq_t testq_in,
+                         ptl_seq_t sequence_in);
+
+
+
+
+/*
+ * Event queues
+ */
+#ifndef PTL_NO_WRAP
+
+/* These should be called by users */
+int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out);
+int PtlEQFree(ptl_handle_eq_t eventq_in);
+
+int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out);
+
+int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout);
+#endif
+
+/*
+ * Access Control Table
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in);
+
+
+/*
+ * Data movement
+ */
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in);
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in);
+
+
+
+#endif
diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h
new file mode 100644 (file)
index 0000000..3c3b154
--- /dev/null
@@ -0,0 +1,265 @@
+#ifndef PTL_BLOCKS_H
+#define PTL_BLOCKS_H
+
+/*
+ * blocks.h
+ *
+ * Argument block types for the Portals 3.0 library
+ * Generated by idl
+ *
+ */
+
+#include <portals/types.h>
+
+/* put LIB_MAX_DISPATCH last here  -- these must match the
+   assignments to the dispatch table in lib-p30/dispatch.c */
+#define PTL_GETID     1
+#define PTL_NISTATUS  2
+#define PTL_NIDIST    3
+#define PTL_NIDEBUG   4
+#define PTL_MEATTACH  5
+#define PTL_MEINSERT  6
+// #define PTL_MEPREPEND 7
+#define PTL_MEUNLINK  8
+#define PTL_TBLDUMP   9 
+#define PTL_MEDUMP   10
+#define PTL_MDATTACH 11
+// #define PTL_MDINSERT 12
+#define PTL_MDBIND   13
+#define PTL_MDUPDATE 14
+#define PTL_MDUNLINK 15
+#define PTL_EQALLOC  16
+#define PTL_EQFREE   17
+#define PTL_ACENTRY  18
+#define PTL_PUT      19 
+#define PTL_GET      20
+#define PTL_FAILNID  21
+#define LIB_MAX_DISPATCH 21
+
+typedef struct PtlFailNid_in {
+       ptl_handle_ni_t interface;
+       ptl_nid_t       nid;
+       unsigned int    threshold;
+} PtlFailNid_in;
+
+typedef struct PtlFailNid_out {
+       int             rc;
+} PtlFailNid_out;
+
+typedef struct PtlGetId_in {
+        ptl_handle_ni_t handle_in;
+} PtlGetId_in;
+
+typedef struct PtlGetId_out {
+        int rc;
+        ptl_process_id_t id_out;
+} PtlGetId_out;
+
+typedef struct PtlNIStatus_in {
+        ptl_handle_ni_t interface_in;
+        ptl_sr_index_t register_in;
+} PtlNIStatus_in;
+
+typedef struct PtlNIStatus_out {
+        int rc;
+        ptl_sr_value_t status_out;
+} PtlNIStatus_out;
+
+
+typedef struct PtlNIDist_in {
+        ptl_handle_ni_t interface_in;
+        ptl_process_id_t process_in;
+} PtlNIDist_in;
+
+typedef struct PtlNIDist_out {
+        int rc;
+        unsigned long distance_out;
+} PtlNIDist_out;
+
+
+typedef struct PtlNIDebug_in {
+        unsigned int mask_in;
+} PtlNIDebug_in;
+
+typedef struct PtlNIDebug_out {
+        unsigned int rc;
+} PtlNIDebug_out;
+
+
+typedef struct PtlMEAttach_in {
+        ptl_handle_ni_t interface_in;
+        ptl_pt_index_t index_in;
+        ptl_ins_pos_t position_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+} PtlMEAttach_in;
+
+typedef struct PtlMEAttach_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEAttach_out;
+
+
+typedef struct PtlMEInsert_in {
+        ptl_handle_me_t current_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+        ptl_ins_pos_t position_in;
+} PtlMEInsert_in;
+
+typedef struct PtlMEInsert_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEInsert_out;
+
+typedef struct PtlMEUnlink_in {
+        ptl_handle_me_t current_in;
+        ptl_unlink_t unlink_in;
+} PtlMEUnlink_in;
+
+typedef struct PtlMEUnlink_out {
+        int rc;
+} PtlMEUnlink_out;
+
+
+typedef struct PtlTblDump_in {
+        int index_in;
+} PtlTblDump_in;
+
+typedef struct PtlTblDump_out {
+        int rc;
+} PtlTblDump_out;
+
+
+typedef struct PtlMEDump_in {
+        ptl_handle_me_t current_in;
+} PtlMEDump_in;
+
+typedef struct PtlMEDump_out {
+        int rc;
+} PtlMEDump_out;
+
+
+typedef struct PtlMDAttach_in {
+        ptl_handle_me_t me_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+        ptl_unlink_t unlink_in;
+} PtlMDAttach_in;
+
+typedef struct PtlMDAttach_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDAttach_out;
+
+
+typedef struct PtlMDBind_in {
+        ptl_handle_ni_t ni_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+} PtlMDBind_in;
+
+typedef struct PtlMDBind_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDBind_out;
+
+
+typedef struct PtlMDUpdate_internal_in {
+        ptl_handle_md_t md_in;
+        ptl_handle_eq_t testq_in;
+        ptl_seq_t sequence_in;
+
+        ptl_md_t old_inout;
+        int old_inout_valid;
+        ptl_md_t new_inout;
+        int new_inout_valid;
+} PtlMDUpdate_internal_in;
+
+typedef struct PtlMDUpdate_internal_out {
+        int rc;
+        ptl_md_t old_inout;
+        ptl_md_t new_inout;
+} PtlMDUpdate_internal_out;
+
+
+typedef struct PtlMDUnlink_in {
+        ptl_handle_md_t md_in;
+} PtlMDUnlink_in;
+
+typedef struct PtlMDUnlink_out {
+        int rc;
+        ptl_md_t status_out;
+} PtlMDUnlink_out;
+
+
+typedef struct PtlEQAlloc_in {
+        ptl_handle_ni_t ni_in;
+        ptl_size_t count_in;
+        void *base_in;
+        int len_in;
+        int (*callback_in) (ptl_event_t * event);
+} PtlEQAlloc_in;
+
+typedef struct PtlEQAlloc_out {
+        int rc;
+        ptl_handle_eq_t handle_out;
+} PtlEQAlloc_out;
+
+
+typedef struct PtlEQFree_in {
+        ptl_handle_eq_t eventq_in;
+} PtlEQFree_in;
+
+typedef struct PtlEQFree_out {
+        int rc;
+} PtlEQFree_out;
+
+
+typedef struct PtlACEntry_in {
+        ptl_handle_ni_t ni_in;
+        ptl_ac_index_t index_in;
+        ptl_process_id_t match_id_in;
+        ptl_pt_index_t portal_in;
+} PtlACEntry_in;
+
+typedef struct PtlACEntry_out {
+        int rc;
+} PtlACEntry_out;
+
+
+typedef struct PtlPut_in {
+        ptl_handle_md_t md_in;
+        ptl_ack_req_t ack_req_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+        ptl_hdr_data_t hdr_data_in;
+} PtlPut_in;
+
+typedef struct PtlPut_out {
+        int rc;
+} PtlPut_out;
+
+
+typedef struct PtlGet_in {
+        ptl_handle_md_t md_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+} PtlGet_in;
+
+typedef struct PtlGet_out {
+        int rc;
+} PtlGet_out;
+
+
+#endif
diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h
new file mode 100644 (file)
index 0000000..285f7e0
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+** $Id: defines.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+**
+** This files contains definitions that are used throughout the cplant code.
+*/
+
+#ifndef CPLANT_H
+#define CPLANT_H
+
+#define TITLE(fname,zmig)
+
+
+/*
+** TRUE and FALSE
+*/
+#undef TRUE
+#define TRUE           (1)
+#undef FALSE
+#define FALSE          (0)
+
+
+/*
+** Return codes from functions
+*/
+#undef OK
+#define OK             (0)
+#undef ERROR
+#define ERROR          (-1)
+
+
+
+/*
+** Fallback max()/min() for any arithmetic type, used only when MAX/MIN
+** are not already defined.  The expansion is fully parenthesized so it
+** keeps its meaning inside larger expressions; each argument is still
+** evaluated twice, so avoid arguments with side effects.
+*/
+#ifndef MAX
+#define MAX(a, b)      (((a) > (b)) ? (a) : (b))
+#endif /* MAX */
+
+#ifndef MIN
+#define MIN(a, b)      (((a) < (b)) ? (a) : (b))
+#endif /* MIN */
+
+/*
+** The rest is from the old qkdefs.h
+*/
+
+#ifndef __linux__
+#define __inline__
+#endif
+
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+#ifndef __osf__
+#define PRIVATE static
+#define PUBLIC
+#endif
+
+#ifndef __osf__
+typedef unsigned char           uchar;
+#endif
+
+typedef char                    CHAR;
+typedef unsigned char           UCHAR;
+typedef char                    INT8;
+typedef unsigned char           UINT8;
+typedef short int               INT16;
+typedef unsigned short int      UINT16;
+typedef int                     INT32;
+typedef unsigned int            UINT32;
+typedef long                    LONG32;
+typedef unsigned long           ULONG32;
+
+/* long may be 32 or 64, so we can't really append the size to the definition */
+typedef long                    LONG;
+typedef unsigned long           ULONG;
+
+#ifdef __alpha__
+typedef long int_t;
+#ifndef __osf__
+typedef unsigned long uint_t;
+#endif
+#endif
+
+#ifdef __i386__
+typedef int int_t;
+typedef unsigned int uint_t;
+#endif
+
+typedef float                   FLOAT32;
+typedef double                  FLOAT64;
+typedef void                    VOID;
+typedef INT32                   BOOLEAN;
+typedef void (*FCN_PTR)(void);
+
+/* Provide off64_t: long on alpha/ia64, long long elsewhere.
+ * NOTE(review): #ifndef only tests for a macro, so this guard does not
+ * detect an off64_t typedef supplied by a system header -- confirm there
+ * is no clash on the supported platforms. */
+#ifndef off64_t
+
+#if defined (__alpha__) || defined (__ia64__)
+typedef long                     off64_t;
+#else
+typedef long long                off64_t;
+#endif
+
+#endif
+
+/*
+** Process related typedefs
+*/
+typedef UINT16 PID_TYPE;  /* Type of Local process ID */
+typedef UINT16 NID_TYPE;  /* Type of Physical node ID */
+typedef UINT16 GID_TYPE;  /* Type of Group ID */
+typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */
+
+
+
+#endif /* CPLANT_H */
diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h
new file mode 100644 (file)
index 0000000..817936a
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _P30_ERRNO_H_
+#define _P30_ERRNO_H_
+
+/*
+ * include/portals/errno.h
+ *
+ * Shared error number lists
+ */
+
+/* If you change these, you must update the string table in api-errno.c */
+typedef enum {
+        PTL_OK              = 0,
+        PTL_SEGV            = 1,
+
+        PTL_NOSPACE         = 2,
+        PTL_INUSE           = 3,
+        PTL_VAL_FAILED      = 4,
+
+        PTL_NAL_FAILED      = 5,
+        PTL_NOINIT          = 6,
+        PTL_INIT_DUP        = 7,
+        PTL_INIT_INV        = 8,
+        PTL_AC_INV_INDEX    = 9,
+
+        PTL_INV_ASIZE       = 10,
+        PTL_INV_HANDLE      = 11,
+        PTL_INV_MD          = 12,
+        PTL_INV_ME          = 13,
+        PTL_INV_NI          = 14,
+/* If you change these, you must update the string table in api-errno.c */
+        PTL_ILL_MD          = 15,
+        PTL_INV_PROC        = 16,
+        PTL_INV_PSIZE       = 17,
+        PTL_INV_PTINDEX     = 18,
+        PTL_INV_REG         = 19,
+
+        PTL_INV_SR_INDX     = 20,
+        PTL_ML_TOOLONG      = 21,
+        PTL_ADDR_UNKNOWN    = 22,
+        PTL_INV_EQ          = 23,
+        PTL_EQ_DROPPED      = 24,
+
+        PTL_EQ_EMPTY        = 25,
+        PTL_NOUPDATE        = 26,
+        PTL_FAIL            = 27,
+        PTL_NOT_IMPLEMENTED = 28,
+        PTL_NO_ACK          = 29,
+
+        PTL_IOV_TOO_MANY    = 30,
+        PTL_IOV_TOO_SMALL   = 31,
+
+       PTL_EQ_INUSE        = 32,
+       PTL_MD_INUSE        = 33,
+
+        PTL_MAX_ERRNO       = 33
+} ptl_err_t;
+/* If you change these, you must update the string table in api-errno.c */
+
+extern const char *ptl_err_str[];
+
+#endif
diff --git a/lustre/portals/include/portals/internal.h b/lustre/portals/include/portals/internal.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h
new file mode 100644 (file)
index 0000000..7e5d73d
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef PTL_DISPATCH_H
+#define PTL_DISPATCH_H
+
+/*
+ * include/dispatch.h
+ *
+ * Dispatch table header and externs for remote side
+ * operations
+ *
+ * Generated by idl
+ *
+ */
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args,
+                           void *ret);
+extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args,
+                                  void *ret);
+extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlACEntry(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret);
+
+extern char *dispatch_name(int index);
+#endif
diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h
new file mode 100644 (file)
index 0000000..4052c0c
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef _LIB_NAL_H_
+#define _LIB_NAL_H_
+
+/*
+ * nal.h
+ *
+ * Library side headers that define the abstraction layer's
+ * responsibilities and interfaces
+ */
+
+#include <portals/lib-types.h>
+
+struct nal_cb_t {
+       /*
+        * Per interface portal table, access control table
+        * and NAL private data field;
+        */
+       lib_ni_t ni;
+       void *nal_data;
+       /*
+        * send:  Sends a preformatted header and user data to a
+        * specified remote process.
+        * Can overwrite iov.
+        */
+       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                       unsigned int niov, struct iovec *iov, size_t mlen);
+
+       /* as send, but with a set of page fragments (NULL if not supported) */
+       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       /*
+        * recv: Receives an incoming message from a remote process
+        * Type of iov depends on options.  Can overwrite iov.
+        */
+       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                       unsigned int niov, struct iovec *iov, size_t mlen, 
+                       size_t rlen);
+
+       /* as recv, but with a set of page fragments (NULL if not supported) */
+       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen, 
+                             size_t rlen);
+       /*
+        * read: Reads a block of data from a specified user address
+        */
+       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                       user_ptr src_addr, size_t len);
+
+       /*
+        * write: Writes a block of data into a specified user address
+        */
+       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+                        void *src_addr, size_t len);
+
+       /*
+        * callback: Calls an event callback
+        */
+       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev);
+
+       /*
+        *  malloc: Acquire a block of memory in a system independent
+        * fashion.
+        */
+       void *(*cb_malloc) (nal_cb_t * nal, size_t len);
+
+       void (*cb_free) (nal_cb_t * nal, void *buf, size_t len);
+
+       /*
+        * (un)map: Tell the NAL about some memory it will access.
+        * *addrkey passed to cb_unmap() is what cb_map() set it to.
+        * type of *iov depends on options.
+        * Set to NULL if not required.
+        */
+       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                      void **addrkey);
+       void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                         void **addrkey);
+
+       /* as (un)map, but with a set of page fragments */
+       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                            void **addrkey);
+       void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                         void **addrkey);
+
+       void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...);
+
+       /* Turn interrupts off (begin of protected area) */
+       void (*cb_cli) (nal_cb_t * nal, unsigned long *flags);
+
+       /* Turn interrupts on (end of protected area) */
+       void (*cb_sti) (nal_cb_t * nal, unsigned long *flags);
+
+       /*
+        * Calculate a network "distance" to given node
+        */
+       int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist);
+};
+
+#endif
diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h
new file mode 100644 (file)
index 0000000..ec3393b
--- /dev/null
@@ -0,0 +1,383 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        /* A wire handle is "none" only when BOTH of its cookies carry the
+         * PTL_WIRE_HANDLE_NONE value; returns 1 in that case, 0 otherwise. */
+        if (wh->wh_interface_cookie != PTL_WIRE_HANDLE_NONE.wh_interface_cookie)
+                return (0);
+
+        return (wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        /* Pops the first entry off fl->fl_list and returns the address of
+         * its fo_contents, or NULL when the list is empty.
+         * ALWAYS called with statelock held */
+        struct list_head *head = &fl->fl_list;
+        lib_freeobj_t    *fo;
+
+        if (!list_empty (head)) {
+                fo = list_entry (head->next, lib_freeobj_t, fo_list);
+                list_del (&fo->fo_list);
+                return ((void *)&fo->fo_contents);
+        }
+
+        return (NULL);
+}
+
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        /* 'obj' is the fo_contents address handed out by
+         * lib_freelist_alloc(); step back to the enclosing lib_freeobj_t
+         * and push it on the head of the free list.
+         * ALWAYS called with statelock held */
+        lib_freeobj_t *fo;
+
+        fo = list_entry (obj, lib_freeobj_t, fo_contents);
+        list_add (&fo->fo_list, &fl->fl_list);
+}
+
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* Takes the statelock itself around the freelist pop, so it must
+         * NEVER be called with statelock held.  Returns NULL when the
+         * EQ freelist is exhausted. */
+        lib_freelist_t *pool = &nal->ni.ni_free_eqs;
+        unsigned long   flags;
+        void           *obj;
+
+        state_lock (nal, &flags);
+        obj = lib_freelist_alloc (pool);
+        state_unlock (nal, &flags);
+
+        return ((lib_eq_t *)obj);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* Returns 'eq' to the EQ freelist.
+         * ALWAYS called with statelock held */
+        lib_freelist_t *pool = &nal->ni.ni_free_eqs;
+
+        lib_freelist_free (pool, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* Takes the statelock itself around the freelist pop, so it must
+         * NEVER be called with statelock held.  Returns NULL when the
+         * MD freelist is exhausted. */
+        lib_freelist_t *pool = &nal->ni.ni_free_mds;
+        unsigned long   flags;
+        void           *obj;
+
+        state_lock (nal, &flags);
+        obj = lib_freelist_alloc (pool);
+        state_unlock (nal, &flags);
+
+        return ((lib_md_t *)obj);
+}
+
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* Returns 'md' to the MD freelist.
+         * ALWAYS called with statelock held */
+        lib_freelist_t *pool = &nal->ni.ni_free_mds;
+
+        lib_freelist_free (pool, md);
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* Takes the statelock itself around the freelist pop, so it must
+         * NEVER be called with statelock held.  Returns NULL when the
+         * ME freelist is exhausted. */
+        lib_freelist_t *pool = &nal->ni.ni_free_mes;
+        unsigned long   flags;
+        void           *obj;
+
+        state_lock (nal, &flags);
+        obj = lib_freelist_alloc (pool);
+        state_unlock (nal, &flags);
+
+        return ((lib_me_t *)obj);
+}
+
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        /* Returns 'me' to the ME freelist.
+         * ALWAYS called with statelock held */
+        lib_freelist_t *pool = &nal->ni.ni_free_mes;
+
+        lib_freelist_free (pool, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        /* Unlike the eq/md/me allocators this one does not lock: it is
+         * ALWAYS called with statelock held.  NULL when none are free. */
+        void *obj = lib_freelist_alloc (&nal->ni.ni_free_msgs);
+
+        return ((lib_msg_t *)obj);
+}
+
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* Returns 'msg' to the message freelist.
+         * ALWAYS called with statelock held */
+        lib_freelist_t *pool = &nal->ni.ni_free_msgs;
+
+        lib_freelist_free (pool, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab; 
+extern kmem_cache_t *ptl_msg_slab; 
+extern kmem_cache_t *ptl_me_slab; 
+extern kmem_cache_t *ptl_eq_slab; 
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* Slab-backed variant: allocates from ptl_eq_slab with GFP_KERNEL
+         * and counts the object in eq_in_use_count on success.
+         * NEVER called with statelock held */
+        lib_eq_t *eq;
+
+        eq = kmem_cache_alloc(ptl_eq_slab, GFP_KERNEL);
+        if (eq != NULL)
+                atomic_inc (&eq_in_use_count);
+
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* Drops the in-use count, then hands 'eq' back to ptl_eq_slab.
+         * ALWAYS called with statelock held */
+        atomic_dec (&eq_in_use_count);
+
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* Slab-backed variant: allocates from ptl_md_slab with GFP_KERNEL
+         * and counts the object in md_in_use_count on success.
+         * NEVER called with statelock held */
+        lib_md_t *md;
+
+        md = kmem_cache_alloc(ptl_md_slab, GFP_KERNEL);
+        if (md != NULL)
+                atomic_inc (&md_in_use_count);
+
+        return (md);
+}
+
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* Drops the in-use count, then hands 'md' back to ptl_md_slab.
+         * ALWAYS called with statelock held */
+        atomic_dec (&md_in_use_count);
+
+        kmem_cache_free(ptl_md_slab, md);
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* Slab-backed variant: allocates from ptl_me_slab with GFP_KERNEL
+         * and counts the object in me_in_use_count on success.
+         * NEVER called with statelock held */
+        lib_me_t *me;
+
+        me = kmem_cache_alloc(ptl_me_slab, GFP_KERNEL);
+        if (me != NULL)
+                atomic_inc (&me_in_use_count);
+
+        return (me);
+}
+
+static inline void
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        /* Drops the in-use count, then hands 'me' back to ptl_me_slab.
+         * ALWAYS called with statelock held */
+        atomic_dec (&me_in_use_count);
+
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        /* Slab-backed variant.  ALWAYS called with statelock held, and
+         * unlike the eq/md/me allocators it uses GFP_ATOMIC.  Counts the
+         * object in msg_in_use_count on success. */
+        lib_msg_t *msg;
+
+        msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+        if (msg != NULL)
+                atomic_inc (&msg_in_use_count);
+
+        return (msg);
+}
+
+static inline void
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* Drops the in-use count, then hands 'msg' back to ptl_msg_slab.
+         * ALWAYS called with statelock held */
+        atomic_dec (&msg_in_use_count);
+
+        kmem_cache_free(ptl_msg_slab, msg);
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        /* The handle is just the cookie of the EQ's embedded lib handle */
+        lib_handle_t *lh = &eq->eq_lh;
+
+        handle->cookie = lh->lh_cookie;
+}
+
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        /* Looks the handle's cookie up on this interface; NULL when
+         * lib_lookup_cookie() finds nothing, otherwise the lib_eq_t that
+         * embeds the handle.
+         * ALWAYS called with statelock held */
+        lib_handle_t *lh;
+
+        lh = lib_lookup_cookie (nal, handle->cookie);
+        if (lh != NULL)
+                return (lh_entry (lh, lib_eq_t, eq_lh));
+
+        return (NULL);
+}
+
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        /* The handle is just the cookie of the MD's embedded lib handle */
+        lib_handle_t *lh = &md->md_lh;
+
+        handle->cookie = lh->lh_cookie;
+}
+
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        /* Looks the handle's cookie up on this interface; NULL when
+         * lib_lookup_cookie() finds nothing, otherwise the lib_md_t that
+         * embeds the handle.
+         * ALWAYS called with statelock held */
+        lib_handle_t *lh;
+
+        lh = lib_lookup_cookie (nal, handle->cookie);
+        if (lh != NULL)
+                return (lh_entry (lh, lib_md_t, md_lh));
+
+        return (NULL);
+}
+
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        /* A wire handle resolves to an MD only if its interface cookie is
+         * this interface's ni_interface_cookie AND its object cookie is
+         * found by lib_lookup_cookie(); otherwise NULL.
+         * ALWAYS called with statelock held */
+        lib_handle_t *lh;
+
+        if (wh->wh_interface_cookie == nal->ni.ni_interface_cookie) {
+                lh = lib_lookup_cookie (nal, wh->wh_object_cookie);
+                if (lh != NULL)
+                        return (lh_entry (lh, lib_md_t, md_lh));
+        }
+
+        return (NULL);
+}
+
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        /* The handle is just the cookie of the ME's embedded lib handle */
+        lib_handle_t *lh = &me->me_lh;
+
+        handle->cookie = lh->lh_cookie;
+}
+
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        /* Looks the handle's cookie up on this interface; NULL when
+         * lib_lookup_cookie() finds nothing, otherwise the lib_me_t that
+         * embeds the handle.
+         * ALWAYS called with statelock held */
+        lib_handle_t *lh;
+
+        lh = lib_lookup_cookie (nal, handle->cookie);
+        if (lh != NULL)
+                return (lh_entry (lh, lib_me_t, me_lh));
+
+        return (NULL);
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie, will contain the necessary information for
+ * finalizing the message.
+ *
+ * After it has finished handling the message, it should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+/* buf2iov: the flat buffer is the source here, so name it 'src' as
+ * the lib_copy_buf2kiov() prototype does */
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h
new file mode 100644 (file)
index 0000000..08ea118
--- /dev/null
@@ -0,0 +1,273 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * p30/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef _LIB_TYPES_H_
+#define _LIB_TYPES_H_
+
+#include <portals/types.h>
+#ifdef __KERNEL__
+# define PTL_USE_SLAB_CACHE
+# include <linux/uio.h>
+# include <linux/smp_lock.h>
+# include <linux/types.h>
+#else
+# include <sys/types.h>
+#endif
+
+/* struct nal_cb_t is defined in lib-nal.h */
+typedef struct nal_cb_t nal_cb_t;
+
+typedef char *user_ptr;
+typedef struct lib_msg_t lib_msg_t;
+typedef struct lib_ptl_t lib_ptl_t;
+typedef struct lib_ac_t lib_ac_t;
+typedef struct lib_me_t lib_me_t;
+typedef struct lib_md_t lib_md_t;
+typedef struct lib_eq_t lib_eq_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+        __u64 wh_interface_cookie;
+        __u64 wh_object_cookie;
+} ptl_handle_wire_t;
+
+/* byte-flip insensitive! */
+#define PTL_WIRE_HANDLE_NONE \
+((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
+
+typedef enum {
+        PTL_MSG_ACK = 0,
+        PTL_MSG_PUT,
+        PTL_MSG_GET,
+        PTL_MSG_REPLY,
+        PTL_MSG_HELLO,
+} ptl_msg_type_t;
+
+/* Each of these structs should start with an odd number of
+ * __u32, or the compiler could add its own padding and confuse
+ * everyone.
+ *
+ * Also, "length" needs to be at offset 28 of each struct.
+ */
+typedef struct ptl_ack {
+        ptl_size_t mlength;
+        ptl_handle_wire_t dst_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for acks) moving out RSN */
+} ptl_ack_t;
+
+typedef struct ptl_put {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t ack_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length moving out RSN */
+        ptl_size_t offset;
+        ptl_hdr_data_t hdr_data;
+} ptl_put_t;
+
+typedef struct ptl_get {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t return_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for gets) moving out RSN */
+        ptl_size_t src_offset;
+        ptl_size_t return_offset;               /* unused: going RSN */
+        ptl_size_t sink_length;
+} ptl_get_t;
+
+typedef struct ptl_reply {
+        __u32 unused1;                          /* unused fields going RSN */
+        ptl_handle_wire_t dst_wmd;
+        ptl_size_t dst_offset;                  /* unused: going RSN */
+        __u32 unused2;
+        ptl_size_t length;                      /* common length moving out RSN */
+} ptl_reply_t;
+
+typedef struct {
+        ptl_nid_t dest_nid;
+        ptl_nid_t src_nid;
+        ptl_pid_t dest_pid;
+        ptl_pid_t src_pid;
+        __u32 type; /* ptl_msg_type_t */
+        union {
+                ptl_ack_t ack;
+                ptl_put_t put;
+                ptl_get_t get;
+                ptl_reply_t reply;
+        } msg;
+} ptl_hdr_t;
+
+/* All length fields in individual unions at same offset */
+/* LASSERT for same in lib-move.c */
+#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length)
+
+/* A HELLO message contains the portals magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * PTL_MSG_HELLO in the type field.  All other fields are zero (including
+ * PTL_HDR_LENGTH; i.e. no payload).
+ * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID, so that hosts with
+ * multiple IP interfaces can have a single NID. These NALs should exchange
+ * HELLO messages when a connection is first established. */
+typedef struct {
+        __u32  magic;                          /* PORTALS_PROTO_MAGIC */
+        __u16   version_major;                  /* increment on incompatible change */
+        __u16   version_minor;                  /* increment on compatible change */
+} ptl_magicversion_t;
+
+#define PORTALS_PROTO_MAGIC                0xeebc0ded
+
+#define PORTALS_PROTO_VERSION_MAJOR        0
+#define PORTALS_PROTO_VERSION_MINOR        1
+
+typedef struct {
+        long recv_count, recv_length, send_count, send_length, drop_count,
+            drop_length, msgs_alloc, msgs_max;
+} lib_counters_t;
+
+/* temporary expedient: limit number of entries in discontiguous MDs */
+#if PTL_LARGE_MTU
+# define PTL_MD_MAX_IOV        64
+#else
+# define PTL_MD_MAX_IOV 16
+#endif
+
+struct lib_msg_t {                              /* library-side message state; obtained from lib_msg_alloc() */
+        struct list_head  msg_list;             /* presumably links into ni.ni_active_msgs -- TODO confirm */
+        int               send_ack;
+        lib_md_t         *md;
+        ptl_nid_t         nid;
+        ptl_pid_t         pid;
+        ptl_event_t       ev;
+        ptl_handle_wire_t ack_wmd;
+        union {                                 /* fragment array: iovecs or page fragments, PTL_MD_MAX_IOV entries either way */
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } msg_iov;
+};
+
+struct lib_ptl_t {
+        ptl_pt_index_t size;
+        struct list_head *tbl;
+};
+
+struct lib_ac_t {
+        int next_free;
+};
+
+typedef struct {                                /* embedded in lib_eq_t/lib_me_t/lib_md_t as eq_lh/me_lh/md_lh */
+        struct list_head  lh_hash_chain;        /* presumably the chain in ni_lh_hash_table -- TODO confirm */
+        __u64             lh_cookie;            /* copied into user handles by ptl_*2handle(); key for lib_lookup_cookie() */
+} lib_handle_t;
+
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+struct lib_eq_t {                               /* library-side event queue; obtained from lib_eq_alloc() */
+        struct list_head  eq_list;              /* presumably links into ni.ni_active_eqs -- TODO confirm */
+        lib_handle_t      eq_lh;                /* its lh_cookie is what ptl_eq2handle() hands out */
+        ptl_seq_t         sequence;
+        ptl_size_t        size;
+        ptl_event_t      *base;
+        int               eq_refcount;
+        int (*event_callback) (ptl_event_t * event);
+        void             *eq_addrkey;           /* presumably the addrkey set by cb_map() -- TODO confirm */
+};
+
+struct lib_me_t {                               /* library-side match entry; obtained from lib_me_alloc() */
+        struct list_head  me_list;
+        lib_handle_t      me_lh;                /* its lh_cookie is what ptl_me2handle() hands out */
+        ptl_process_id_t  match_id;
+        ptl_match_bits_t  match_bits, ignore_bits;
+        ptl_unlink_t      unlink;
+        lib_md_t         *md;                   /* lib_md_t carries the reverse pointer in its 'me' field */
+};
+
+struct lib_md_t {                               /* library-side memory descriptor; obtained from lib_md_alloc() */
+        struct list_head  md_list;              /* presumably links into ni.ni_active_mds -- TODO confirm */
+        lib_handle_t      md_lh;                /* its lh_cookie is what ptl_md2handle() hands out */
+        lib_me_t         *me;
+        user_ptr          start;
+        ptl_size_t        offset;
+        ptl_size_t        length;
+        ptl_size_t        max_size;
+        int               threshold;
+        int               pending;
+        ptl_unlink_t      unlink;
+        unsigned int      options;
+        unsigned int      md_flags;             /* presumably PTL_MD_FLAG_* bits -- TODO confirm */
+        void             *user_ptr;
+        lib_eq_t         *eq;
+        void             *md_addrkey;           /* presumably the addrkey set by cb_map() -- TODO confirm */
+        unsigned int      md_niov;                /* # frags */
+        union {                                 /* fragment array: iovecs or page fragments, PTL_MD_MAX_IOV entries either way */
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } md_iov;
+};
+
+#define PTL_MD_FLAG_UNLINK            (1 << 0)
+#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
+
+#ifndef PTL_USE_SLAB_CACHE
+typedef struct
+{
+        void             *fl_objs;             /* single contiguous array of objects */
+        int                fl_nobjs;            /* the number of them */
+        int                fl_objsize;          /* the size (including overhead) of each of them */
+        struct list_head   fl_list;             /* where they are enqueued */
+} lib_freelist_t;
+
+typedef struct
+{
+        struct list_head   fo_list;             /* enqueue on fl_list */
+        void              *fo_contents;         /* aligned contents */
+} lib_freeobj_t;
+#endif
+
+typedef struct {
+        /* info about peers we are trying to fail */
+        struct list_head  tp_list;             /* stash in ni.ni_test_peers */
+        ptl_nid_t         tp_nid;              /* matching nid */
+        unsigned int      tp_threshold;        /* # failures to simulate */
+} lib_test_peer_t;
+
+typedef struct {                                /* per-interface library state; reached as nal->ni by the inline helpers */
+        int up;
+        int refcnt;
+        ptl_nid_t nid;
+        ptl_pid_t pid;
+        int num_nodes;
+        unsigned int debug;
+        lib_ptl_t tbl;
+        lib_ac_t ac;
+        lib_counters_t counters;
+
+        int               ni_lh_hash_size;      /* size of lib handle hash table */
+        struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
+        __u64             ni_next_object_cookie; /* cookie generator */
+        __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
+        
+        struct list_head ni_test_peers;         /* lib_test_peer_t entries, linked through tp_list */
+        
+#ifndef PTL_USE_SLAB_CACHE
+        lib_freelist_t   ni_free_mes;           /* pool behind lib_me_alloc()/lib_me_free() */
+        lib_freelist_t   ni_free_msgs;          /* pool behind lib_msg_alloc()/lib_msg_free() */
+        lib_freelist_t   ni_free_mds;           /* pool behind lib_md_alloc()/lib_md_free() */
+        lib_freelist_t   ni_free_eqs;           /* pool behind lib_eq_alloc()/lib_eq_free() */
+#endif
+        struct list_head ni_active_msgs;
+        struct list_head ni_active_mds;
+        struct list_head ni_active_eqs;
+} lib_ni_t;
+
+#endif
diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h
new file mode 100644 (file)
index 0000000..41613ab
--- /dev/null
@@ -0,0 +1,246 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+/* No-op stand-in for the kernel's prefetch(): evaluates its argument and
+ * discards the result.  The argument is parenthesised so that expression
+ * arguments are cast as a whole. */
+#define prefetch(a) ((void)(a))
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head * new,
+                             struct list_head * prev,
+                             struct list_head * next)
+{
+       /* point the new entry at its neighbours first ... */
+       new->next = next;
+       new->prev = prev;
+       /* ... then make both neighbours point back at it */
+       next->prev = new;
+       prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       /* slot in between the head and the current first entry */
+       struct list_head *first = head->next;
+
+       __list_add(new, head, first);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       /* slot in between the current last entry and the head */
+       struct list_head *last = head->prev;
+
+       __list_add(new, last, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       /* bridge over whatever sat between prev and next */
+       prev->next = next;
+       next->prev = prev;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       /* entry's own pointers are left untouched (still aimed at its
+        * former neighbours) */
+       struct list_head *before = entry->prev;
+       struct list_head *after = entry->next;
+
+       __list_del(before, after);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       /* unlink, then leave the entry as a valid empty list of its own */
+       list_del(entry);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+       /* unlink from wherever it is now, re-insert right after head */
+       list_del(list);
+       list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+       /* unlink from wherever it is now, re-insert right before head */
+       list_del(list);
+       list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+       /* an empty list's head points back at itself; the list is only
+        * read, hence the const-qualified parameter */
+       return head->next == head;
+}
+
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       /* Caller guarantees 'list' is non-empty (see list_splice()).
+        * The run [first..last] is inserted between head and its old
+        * first entry; 'list' itself is not modified. */
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *old_first = head->next;
+
+       head->next = first;
+       first->prev = head;
+
+       old_first->prev = last;
+       last->next = old_first;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       /* nothing to move from an empty list */
+       if (list_empty(list))
+               return;
+
+       __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       /* nothing to move from an empty list (which is already initialised) */
+       if (list_empty(list))
+               return;
+
+       __list_splice(list, head);
+       INIT_LIST_HEAD(list);
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+               pos = pos->next, prefetch(pos->next))
+
+/**
+ * list_for_each_prev  -       iterate over a list in reverse order
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
+               pos = pos->prev, prefetch(pos->prev))
+
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+#endif
+
+#ifndef list_for_each_entry
+/**
+ * list_for_each_entry  -       iterate over list of given type
+ * @pos:        the type * to use as a loop counter.
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)                         \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+                    prefetch(pos->member.next);                        \
+            &pos->member != (head);                                    \
+            pos = list_entry(pos->member.next, typeof(*pos), member),  \
+            prefetch(pos->member.next))
+#endif
+
+#ifndef list_for_each_entry_safe
+/**
+ * list_for_each_entry_safe  -       iterate over list of given type safe against removal of list entry
+ * @pos:        the type * to use as a loop counter.
+ * @n:          the &struct list_head to use as temporary storage
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                         \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+                    n = pos->member.next;                              \
+            &pos->member != (head);                                    \
+            pos = list_entry(n, typeof(*pos), member),                 \
+            n = pos->member.next)
+#endif
diff --git a/lustre/portals/include/portals/lltrace.h b/lustre/portals/include/portals/lltrace.h
new file mode 100644 (file)
index 0000000..7d1b304
--- /dev/null
@@ -0,0 +1,175 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl 
+ */
+#ifndef __LTRACE_H_
+#define __LTRACE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <portals/types.h>
+#include <portals/ptlctl.h>
+#include <linux/kp30.h>
+#include <linux/limits.h>
+#include <asm/page.h>
+#include <linux/version.h>
+
+static inline int ltrace_write_file(char* fname)
+{
+        /* Echoes the equivalent ptlctl command line on stderr, then calls
+         * jt_dbg_debug_kernel() with { "debug_kernel", fname, "1" } and
+         * returns its result. */
+        char *cmd[3] = { "debug_kernel", fname, "1" };
+
+        fprintf(stderr, "[ptlctl] %s %s %s\n", cmd[0], cmd[1], cmd[2]);
+
+        return jt_dbg_debug_kernel(3, cmd);
+}
+
+static inline int ltrace_clear()
+{
+        /* Echoes the command on stderr, then calls
+         * jt_dbg_clear_debug_buf() with { "clear" } and returns its result. */
+        char *cmd[1] = { "clear" };
+
+        fprintf(stderr, "[ptlctl] %s\n", cmd[0]);
+
+        return jt_dbg_clear_debug_buf(1, cmd);
+}
+
+static inline int ltrace_mark(int indent_level, char* text)
+{
+        /* Formats "====<indent_level>=<text>" (truncated to PATH_MAX) and
+         * passes it to jt_dbg_mark_debug_buf() as the argument of "mark". */
+        char  mark_buf[PATH_MAX];
+        char *cmd[2] = { "mark", mark_buf };
+
+        snprintf(mark_buf, sizeof(mark_buf), "====%d=%s", indent_level, text);
+
+        return jt_dbg_mark_debug_buf(2, cmd);
+}
+
+static inline int ltrace_applymasks()
+{
+        /* Echoes the command on stderr, then calls jt_dbg_list() with
+         * { "list", "applymasks" } and returns its result. */
+        char *cmd[2] = { "list", "applymasks" };
+
+        fprintf(stderr, "[ptlctl] %s %s\n", cmd[0], cmd[1]);
+
+        return jt_dbg_list(2, cmd);
+}
+
+
+static inline int ltrace_filter(char* subsys_or_mask)
+{
+        /* Calls jt_dbg_filter() with { "filter", subsys_or_mask } and
+         * returns its result. */
+        char *cmd[2] = { "filter", subsys_or_mask };
+
+        return jt_dbg_filter(2, cmd);
+}
+
+static inline int ltrace_show(char* subsys_or_mask)
+{
+        /* Calls jt_dbg_show() with { "show", subsys_or_mask } and
+         * returns its result. */
+        char *cmd[2] = { "show", subsys_or_mask };
+
+        return jt_dbg_show(2, cmd);
+}
+
+static inline int ltrace_start()
+{
+        /* Initialises the debug helpers, registers the portals ioctl
+         * device when PORTALS_DEV_ID is defined, then issues a fixed
+         * sequence of filter/show commands followed by applymasks.
+         * Returns the register_ioc_dev() result (0 when that step is
+         * compiled out); results of the filter/show calls are ignored. */
+        char *before_show[] = { "class", "socknal", "qswnal", "gmnal",
+                                "portals" };
+        char *after_show[] = { "trace", "malloc", "net", "page", "other",
+                               "info" };
+        unsigned int i;
+        int rc = 0;
+
+        dbg_initialize(0, NULL);
+#ifdef PORTALS_DEV_ID
+        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#endif
+        for (i = 0; i < sizeof(before_show) / sizeof(before_show[0]); i++)
+                ltrace_filter(before_show[i]);
+
+        ltrace_show("all_types");
+
+        for (i = 0; i < sizeof(after_show) / sizeof(after_show[0]); i++)
+                ltrace_filter(after_show[i]);
+
+        ltrace_applymasks();
+
+        return rc;
+}
+
+
+static inline void ltrace_stop()
+{       /* no-op unless PORTALS_DEV_ID is defined */
+#ifdef PORTALS_DEV_ID
+        unregister_ioc_dev(PORTALS_DEV_ID);     /* pairs with register_ioc_dev() in ltrace_start() */
+#endif
+}
+
+static inline int not_uml(void)
+{
+        /* Return Values:
+         *   0 when /dev/ubd can be stat()ed, i.e. when run under UML
+         *   1 when run on host; this is also assumed (after printing a
+         *     diagnostic) when stat() fails for any reason other than
+         *     ENOENT
+         * A negative value is never returned.
+         */
+        struct stat buf;
+
+        if (stat("/dev/ubd", &buf) == 0)
+                return 0;
+
+        if (errno != ENOENT)
+                fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+
+        return 1; /* Assume host */
+}
+
+#define LTRACE_MAX_NOB   256
+static inline void ltrace_add_processnames(char* fname)
+{
+        /* Builds a "ps --no-headers -eo ..." command whose output format
+         * starts with a CDEBUG-style prefix, redirects it to be appended
+         * to 'fname', and runs it with system().
+         *
+         * Each piece is formatted into the space that is actually left in
+         * cmdbuf; if the command does not fit, nothing is executed.
+         *
+         * NOTE(review): fname is pasted unquoted into a shell command
+         * line, so it must not come from an untrusted source.
+         */
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        int nob = 0;
+        int rc;
+        int underuml = !not_uml();
+
+        gettimeofday(&tv, NULL);
+
+        rc = snprintf(cmdbuf + nob, LTRACE_MAX_NOB - nob,
+                      "ps --no-headers -eo \"");
+        if (rc < 0 || rc >= LTRACE_MAX_NOB - nob)
+                goto too_long;
+        nob += rc;
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        rc = snprintf(cmdbuf + nob, LTRACE_MAX_NOB - nob,
+                      "%02x:%06x:%d:%lu.%06lu ",
+                      S_RPC >> 24, D_VFSTRACE, 0,
+                      (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+        if (rc < 0 || rc >= LTRACE_MAX_NOB - nob)
+                goto too_long;
+        nob += rc;
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                rc = snprintf (cmdbuf + nob, LTRACE_MAX_NOB - nob,
+                               "(%s:%d:%s() %d | %d+%lu): ",
+                               "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                rc = snprintf (cmdbuf + nob, LTRACE_MAX_NOB - nob,
+                               "(%s:%d:%s() %d+%lu): ",
+                               "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+        if (rc < 0 || rc >= LTRACE_MAX_NOB - nob)
+                goto too_long;
+        nob += rc;
+
+        rc = snprintf(cmdbuf + nob, LTRACE_MAX_NOB - nob,
+                      " %%p %%c\" >> %s", fname);
+        if (rc < 0 || rc >= LTRACE_MAX_NOB - nob)
+                goto too_long;
+
+        system(cmdbuf);
+        return;
+
+too_long:
+        /* a truncated shell command must never be executed */
+        fprintf(stderr, "ltrace_add_processnames: command does not fit in "
+                "%d bytes, not run\n", LTRACE_MAX_NOB);
+}
+
+#endif
diff --git a/lustre/portals/include/portals/myrnal.h b/lustre/portals/include/portals/myrnal.h
new file mode 100644 (file)
index 0000000..6a61fd5
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+** $Id: myrnal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+       void *args;             /* argument buffer -- presumably at most MAX_ARGS_LEN bytes; TODO confirm */
+       size_t args_len;        /* presumably the number of bytes at args; TODO confirm */
+       void *ret;              /* result buffer -- presumably at most MAX_RET_LEN bytes; TODO confirm */
+       size_t ret_len;         /* presumably the number of bytes at ret; TODO confirm */
+       int p3cmd;              /* NOTE(review): looks like one of P3CMD/P3SYSCALL/P3REGISTER -- confirm */
+} myrnal_forward_t;
+
+#endif                         /* MYRNAL_H */
diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h
new file mode 100644 (file)
index 0000000..c1c50ed
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+** $Id: nal.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+
+/*
+ * p30/nal.h
+ *
+ * The API side NAL declarations
+ */
+
+#include <portals/types.h>
+
+#ifdef yield
+#undef yield
+#endif
+
+typedef struct nal_t nal_t;
+
+struct nal_t { /* API-side view of one NAL: its state plus the hooks the NAL implementation fills in */
+       ptl_ni_t ni;
+       int refct;              /* presumably a reference count -- TODO confirm who maintains it */
+       void *nal_data;         /* opaque pointer supplied by the NAL implementation */
+       int *timeout;           /* for libp30api users */
+       int (*forward) (nal_t * nal, int index, /* Function ID */
+                       void *args, size_t arg_len, void *ret, size_t ret_len);
+
+       int (*shutdown) (nal_t * nal, int interface);
+
+       int (*validate) (nal_t * nal, void *base, size_t extent); /* a NAL may leave this NULL (gmnal does) */
+
+       void (*yield) (nal_t * nal);
+
+       void (*lock) (nal_t * nal, unsigned long *flags); /* paired with unlock; flags is passed through to the NAL */
+
+       void (*unlock) (nal_t * nal, unsigned long *flags);
+};
+
+typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);
+
+#ifndef PTL_IFACE_DEFAULT
+#define PTL_IFACE_DEFAULT (PTL_IFACE_IP)
+#endif
+
+#endif
diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h
new file mode 100644 (file)
index 0000000..1b837b4
--- /dev/null
@@ -0,0 +1,4 @@
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define
+ * It should probably give them better names...
+ *
+ * NOTE(review): PTL_DEBUG_NI_ALL (0xF000) covers bits 12-15, whereas
+ * PTL_DEBUG_NI0..PTL_DEBUG_NI7 are bits 24-31, so the mask selects none
+ * of them -- confirm which of the two is intended.
+ */
+#define PTL_DEBUG_NI_ALL        (0xF000ul)     /* Only the NAL flags */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h
new file mode 100644 (file)
index 0000000..34e5dc5
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * TITLE(ppid_h, "@(#) $Id: ppid.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $");
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* this needs to fit into 16 bits so the 
+                                    maximum value is 65535. having it "large"
+                                    can help w/ debugging process accounting
+                                    but there are reasons for making it 
+                                    somewhat smaller than the maximum --
+                                    requiring storage for arrays that index 
+                                    on the ppid, eg...  */
+                                 
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+#define PPID_FLOATING    (MAX_FIXED_PPID+1)   /* Floating area starts here; parenthesized so it is safe inside larger expressions */
+#define GID_FLOATING     (MAX_FIXED_GID+1)    /* Floating area starts here */
+#define NUM_PTL_TASKS    (MAX_FIXED_PPID+80)  /* Maximum no. portals tasks */
+
+#define PPID_AUTO        0
+
+/* Minimum PPID is 1 */
+#define PPID_BEBOPD      1            /* bebopd */
+#define  GID_BEBOPD      1            /* bebopd */
+
+#define PPID_PCT         2            /* pct */
+#define  GID_PCT         2            /* pct */
+
+#define PPID_FYOD        3            /* fyod */
+#define  GID_FYOD        3            /* fyod */
+
+#define PPID_GDBWRAP     11           /* portals proxy for gdb */
+#define  GID_GDBWRAP     11           /* portals proxy for gdb */
+
+#define PPID_TEST        15           /* for portals tests */
+#define  GID_TEST        15
+
+#define  GID_YOD         5            /* yod */
+#define  GID_PINGD       6            /* pingd */
+#define  GID_BT          7            /* bt */
+#define  GID_PTLTEST     8            /* ptltest */
+#define  GID_CGDB        9            /* cgdb */
+#define  GID_TVDSVR     10            /* start-tvdsvr */
+
+#endif /* _INCppidh_ */
diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h
new file mode 100644 (file)
index 0000000..fdaae69
--- /dev/null
@@ -0,0 +1,74 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
diff --git a/lustre/portals/include/portals/stringtab.h b/lustre/portals/include/portals/stringtab.h
new file mode 100644 (file)
index 0000000..65ab189
--- /dev/null
@@ -0,0 +1,6 @@
+/*
+** $Id: stringtab.h,v 1.1.2.1 2003/05/19 04:25:31 braam Exp $
+*/
+/*
+ * stringtab.h
+ */
diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h
new file mode 100644 (file)
index 0000000..d4038b6
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef _P30_TYPES_H_
+#define _P30_TYPES_H_
+
+#ifdef __linux__
+#include <asm/types.h>
+#include <asm/timex.h>
+#else
+#include <sys/types.h>
+typedef u_int32_t __u32;
+typedef u_int64_t __u64;
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void) { return 0; }
+#endif
+
+typedef __u64 ptl_nid_t;
+typedef __u32 ptl_pid_t;
+typedef __u32 ptl_pt_index_t;
+typedef __u32 ptl_ac_index_t;
+typedef __u64 ptl_match_bits_t;
+typedef __u64 ptl_hdr_data_t;
+typedef __u32 ptl_size_t;
+
+typedef struct {
+        unsigned long nal_idx;                 /* which network interface */
+        __u64         cookie;                  /* which thing on that interface */
+} ptl_handle_any_t;
+
+typedef ptl_handle_any_t ptl_handle_ni_t;
+typedef ptl_handle_any_t ptl_handle_eq_t;
+typedef ptl_handle_any_t ptl_handle_md_t;
+typedef ptl_handle_any_t ptl_handle_me_t;
+
+#define PTL_HANDLE_NONE \
+((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
+#define PTL_EQ_NONE PTL_HANDLE_NONE
+
+/* Compare two handles: returns 1 when both the interface index and the
+ * cookie match, 0 otherwise. */
+static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+{
+       if (h1.nal_idx != h2.nal_idx)
+               return 0;
+
+       return (h1.cookie == h2.cookie);
+}
+
+#define PTL_NID_ANY      ((ptl_nid_t) -1)
+#define PTL_PID_ANY      ((ptl_pid_t) -1)
+
+typedef struct {
+        ptl_nid_t nid;
+        ptl_pid_t pid;   /* node id / process id */
+} ptl_process_id_t;
+
+typedef enum {
+        PTL_RETAIN = 0,
+        PTL_UNLINK
+} ptl_unlink_t;
+
+typedef enum {
+        PTL_INS_BEFORE,
+        PTL_INS_AFTER
+} ptl_ins_pos_t;
+
+typedef struct {
+       struct page     *kiov_page;
+       unsigned int     kiov_len;
+       unsigned int     kiov_offset;
+} ptl_kiov_t;
+
+typedef struct {
+        void            *start;        /* base of the described memory -- presumably an iovec/kiov array when PTL_MD_IOV/PTL_MD_KIOV is set; TODO confirm */
+        ptl_size_t       length;
+        int              threshold;    /* PTL_MD_THRESH_INF (-1) is the defined special value */
+        int              max_size;     /* presumably only meaningful with PTL_MD_MAX_SIZE -- TODO confirm */
+        unsigned int     options;      /* PTL_MD_* option bits */
+        void            *user_ptr;
+        ptl_handle_eq_t  eventq;       /* event queue handle; PTL_EQ_NONE is the "none" value */
+       unsigned int     niov;          /* presumably the entry count for PTL_MD_IOV/PTL_MD_KIOV -- TODO confirm */
+} ptl_md_t;
+
+/* Options for the MD structure */
+#define PTL_MD_OP_PUT           (1 << 0)
+#define PTL_MD_OP_GET           (1 << 1)
+#define PTL_MD_MANAGE_REMOTE    (1 << 2)
+#define PTL_MD_AUTO_UNLINK      (1 << 3)
+#define PTL_MD_TRUNCATE         (1 << 4)
+#define PTL_MD_ACK_DISABLE      (1 << 5)
+#define PTL_MD_IOV             (1 << 6)
+#define PTL_MD_MAX_SIZE                (1 << 7)
+#define PTL_MD_KIOV             (1 << 8)
+
+#define PTL_MD_THRESH_INF       (-1)
+
+typedef enum {
+        PTL_EVENT_GET,
+        PTL_EVENT_PUT,
+        PTL_EVENT_REPLY,
+        PTL_EVENT_ACK,
+        PTL_EVENT_SENT
+} ptl_event_kind_t;
+
+#define PTL_SEQ_BASETYPE       long
+typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
+#define PTL_SEQ_GT(a,b)        (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) /* compares via the signed difference of a and b -- presumably so the ordering survives counter wrap-around */
+
+typedef struct {
+        ptl_event_kind_t type;
+        ptl_process_id_t initiator;
+        ptl_pt_index_t portal;
+        ptl_match_bits_t match_bits;
+        ptl_size_t rlength, mlength, offset;
+        ptl_handle_me_t unlinked_me;
+        ptl_md_t mem_desc;
+        ptl_hdr_data_t hdr_data;
+        cycles_t  arrival_time;
+        volatile ptl_seq_t sequence;
+} ptl_event_t;
+
+
+typedef enum {
+        PTL_ACK_REQ,
+        PTL_NOACK_REQ
+} ptl_ack_req_t;
+
+
+typedef struct {
+        volatile ptl_seq_t sequence;
+        ptl_size_t size;
+        ptl_event_t *base;
+        ptl_handle_any_t cb_eq_handle;
+} ptl_eq_t;
+
+typedef struct {
+        ptl_eq_t *eq;
+} ptl_ni_t;
+
+
+typedef struct {
+        int max_match_entries;    /* max number of match entries */
+        int max_mem_descriptors;  /* max number of memory descriptors */
+        int max_event_queues;     /* max number of event queues */
+        int max_atable_index;     /* maximum access control list table index */
+        int max_ptable_index;     /* maximum portals table index */
+} ptl_ni_limits_t;
+
+/*
+ * Status registers
+ */
+typedef enum {
+        PTL_SR_DROP_COUNT,
+        PTL_SR_DROP_LENGTH,
+        PTL_SR_RECV_COUNT,
+        PTL_SR_RECV_LENGTH,
+        PTL_SR_SEND_COUNT,
+        PTL_SR_SEND_LENGTH,
+        PTL_SR_MSGS_MAX,
+} ptl_sr_index_t;
+
+typedef int ptl_sr_value_t;
+
+#endif
diff --git a/lustre/portals/knals/Makefile.am b/lustre/portals/knals/Makefile.am
new file mode 100644 (file)
index 0000000..5c6085e
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS= socknal toenal        @QSWNAL@ @GMNAL@ @SCIMACNAL@
diff --git a/lustre/portals/knals/Makefile.mk b/lustre/portals/knals/Makefile.mk
new file mode 100644 (file)
index 0000000..ce40a60
--- /dev/null
@@ -0,0 +1,4 @@
+include ../Kernelenv
+
+obj-y = socknal/
+# more coming...
\ No newline at end of file
diff --git a/lustre/portals/knals/gmnal/Makefile.am b/lustre/portals/knals/gmnal/Makefile.am
new file mode 100644 (file)
index 0000000..1dc6f4e
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kgmnal
+modulenet_DATA = kgmnal.o
+EXTRA_PROGRAMS = kgmnal
+
+DEFS =
+kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
diff --git a/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch
new file mode 100644 (file)
index 0000000..23c80d9
--- /dev/null
@@ -0,0 +1,43 @@
+diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
+--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c        Mon Jul  1 10:35:09 2002
++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c    Thu Sep 19 14:19:38 2002
+@@ -30,6 +30,8 @@
+  *
+  ************************************************************************/
++#define EXPORT_SYMTAB
++
+ #include <linux/config.h>
+ #include <linux/module.h>
+@@ -4075,6 +4077,28 @@
+   return 0;
+ }
++EXPORT_SYMBOL(gm_blocking_receive_no_spin);
++EXPORT_SYMBOL(gm_close);
++EXPORT_SYMBOL(gm_dma_free);
++EXPORT_SYMBOL(gm_dma_malloc);
++EXPORT_SYMBOL(gm_drop_sends);
++EXPORT_SYMBOL(gm_finalize);
++EXPORT_SYMBOL(gm_get_node_id);
++EXPORT_SYMBOL(gm_init);
++EXPORT_SYMBOL(gm_initialize_alarm);
++EXPORT_SYMBOL(gm_max_node_id_in_use);
++EXPORT_SYMBOL(gm_min_size_for_length);
++EXPORT_SYMBOL(gm_num_receive_tokens);
++EXPORT_SYMBOL(gm_num_send_tokens);
++EXPORT_SYMBOL(gm_open);
++EXPORT_SYMBOL(gm_provide_receive_buffer);
++EXPORT_SYMBOL(gm_resume_sending);
++EXPORT_SYMBOL(gm_send_with_callback);
++EXPORT_SYMBOL(gm_set_acceptable_sizes);
++EXPORT_SYMBOL(gm_set_alarm);
++EXPORT_SYMBOL(gm_unknown);
++
++
+ /*
+   This file uses GM standard indentation.
+Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
+Only in gm-1.5.2.1_Linux-cfs/: trace
diff --git a/lustre/portals/knals/gmnal/gmnal.c b/lustre/portals/knals/gmnal/gmnal.c
new file mode 100644 (file)
index 0000000..ceeea2a
--- /dev/null
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "gmnal.h"
+
+ptl_handle_ni_t kgmnal_ni;
+nal_t  kgmnal_api;
+
+kgmnal_data_t kgmnal_data;
+int gmnal_debug = 0;
+
+/* NAL interface descriptor (kpr_nal_interface_t) for the GM NAL.
+ * NOTE(review): the symbol carries the qswnal prefix although this file
+ * is the GM NAL -- looks like a copy/paste leftover; confirm there are
+ * no external users before renaming. */
+kpr_nal_interface_t kqswnal_router_interface = {
+        .kprni_nalid = GMNAL,
+        .kprni_arg   = NULL,
+        .kprni_fwd   = kgmnal_fwd_packet,
+};
+
+/* nal_t.forward hook for the GM NAL: checks that it was handed the GM
+ * NAL singletons and passes the request on to lib_dispatch().  The
+ * args_len and ret_len arguments are not used.  Always returns PTL_OK. */
+static int kgmnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kgmnal_data_t *k;
+        nal_cb_t      *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        /* nal needs k */
+        lib_dispatch(nal_cb, k, id, args, ret);
+
+        return PTL_OK;
+}
+
+/* nal_t.lock hook: delegates to the library callback table's cb_cli
+ * entry, handing it the caller's flags word. */
+static void kgmnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k;
+        nal_cb_t      *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_cli(nal_cb, flags);
+}
+
+/* nal_t.unlock hook: delegates to the library callback table's cb_sti
+ * entry, handing it the caller's flags word. */
+static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k;
+        nal_cb_t      *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_sti(nal_cb, flags);
+}
+
+static int kgmnal_shutdown(nal_t *nal, int ni) /* nal_t.shutdown hook: only asserts the NAL identity; ni is unused; always returns 0 */
+{
+        LASSERT (nal == &kgmnal_api);
+        return 0;
+}
+
+/* nal_t.yield hook: call schedule() when the current task has a
+ * reschedule pending, otherwise return immediately. */
+static void kgmnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kgmnal_api);
+
+        if (!current->need_resched)
+                return;
+
+        schedule();
+}
+
+/* Allocate a receive descriptor and link it at the head of
+ * data->kgm_list.  Returns the new descriptor, or NULL (after logging)
+ * when the allocation fails.  The ndx argument is currently unused. */
+kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
+{
+        kgmnal_rx_t *conn;
+
+        PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
+        if (conn != NULL) {
+                list_add(&conn->krx_item, &data->kgm_list);
+                return conn;
+        }
+
+        /* out of memory */
+        printk("kgm_add_recv: memory alloc failed\n");
+        return NULL;
+}
+
+/* Interface-init callback handed to PtlNIInit(): asks GM for the
+ * highest node id in use, initialises the portals library with this
+ * node's nid and the requested table sizes, and returns the GM NAL's
+ * API object.  The interface and requested_pid arguments are unused. */
+static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
+                          ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        unsigned int max_node_id;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &max_node_id);
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
+               kgmnal_data.kgm_nid, max_node_id);
+
+        lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, max_node_id,
+                 ptl_size, ac_size);
+
+        return &kgmnal_api;
+}
+
+/* Module exit: unregister and shut down the network interface, close
+ * the GM port, then release the transmit descriptor array and every
+ * receive descriptor still linked on kgmnal_data.kgm_list.
+ */
+static void __exit
+kgmnal_finalize(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
+        PtlNIFini(kgmnal_ni);
+        /* NOTE(review): lib_init() is given &kgmnal_lib in kgmnal_init(),
+         * yet lib_fini() is handed &kgmnal_api here -- confirm against
+         * lib_fini()'s prototype. */
+        lib_fini(&kgmnal_api);
+
+        if (kgmnal_data.kgm_port) {
+                gm_close(kgmnal_data.kgm_port);
+        }
+
+        /* FIXME: free dma buffers */
+        /* FIXME: kill receiver thread */
+
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+
+        /* Pop descriptors off the head until the list is empty.  Walking
+         * with list_for_each() while deleting skipped every other entry,
+         * because the cursor was advanced both by hand and by the loop. */
+        while (!list_empty(&kgmnal_data.kgm_list)) {
+                kgmnal_rx_t *conn;
+
+                conn = list_entry(kgmnal_data.kgm_list.next,
+                                  kgmnal_rx_t, krx_item);
+                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
+                list_del(&conn->krx_item);
+                PORTAL_FREE(conn, sizeof(*conn));
+        }
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+/* Module init for the GM NAL.
+ *
+ * Fills in the API hook table, allocates the transmit descriptors,
+ * initialises GM and opens port KGM_PORT_NUM, hands GM half of the
+ * receive tokens as large and half as small DMA buffers, restricts the
+ * accepted message sizes, brings up the portals network interface and
+ * starts the receiver thread.
+ *
+ * Returns 0 on success and a negative value on failure; on failure
+ * everything acquired so far (descriptors, GM state, the open port) is
+ * released again before returning.
+ */
+static int __init
+kgmnal_initialize(void)
+{
+        int rc;
+        int ntok;
+        unsigned long sizemask;
+        unsigned int nid;
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kgmnal_api.forward = kgmnal_forward;
+        kgmnal_api.shutdown = kgmnal_shutdown;
+        kgmnal_api.yield = kgmnal_yield;
+        kgmnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kgmnal_api.lock= kgmnal_lock;
+        kgmnal_api.unlock= kgmnal_unlock;
+        kgmnal_api.nal_data = &kgmnal_data;
+
+        kgmnal_lib.nal_data = &kgmnal_data;
+
+        memset(&kgmnal_data, 0, sizeof(kgmnal_data));
+
+        INIT_LIST_HEAD(&kgmnal_data.kgm_list);
+        kgmnal_data.kgm_cb = &kgmnal_lib;
+
+        /* Allocate transmit descriptors */
+        PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        if (kgmnal_data.kgm_trans==NULL) {
+                printk("kgmnal: init: failed to allocate transmit "
+                       "descriptors\n");
+                return -ENOMEM;
+        }
+        memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
+
+        spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
+        spin_lock_init(&kgmnal_data.kgm_update_lock);
+        spin_lock_init(&kgmnal_data.kgm_send_lock);
+
+        /* Do the receiver and xmtr allocation */
+
+        rc = gm_init();
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_init failed: %d\n", rc);
+                rc = -1;
+                goto out_free_trans;
+        }
+
+        rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
+                     GM_API_VERSION_1_1);
+        if (rc != GM_SUCCESS) {
+                kgmnal_data.kgm_port = NULL;
+                CERROR("gm_open failed: %d\n", rc);
+                rc = -1;
+                goto out_gm_finalize;
+        }
+        gm_get_node_id(kgmnal_data.kgm_port, &nid);
+        kgmnal_data.kgm_nid = nid;
+        /* Allocate 2 different sizes of buffers. For now, use half
+           the tokens for each. */
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
+               ntok, MSG_LEN_LARGE);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_LARGE);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed for a large recv "
+                               "buffer\n");
+                        rc = -ENOMEM;
+                        goto out_gm_close;
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+        }
+
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
+               ntok, MSG_LEN_SMALL);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_SMALL);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed for a small recv "
+                               "buffer\n");
+                        rc = -ENOMEM;
+                        goto out_gm_close;
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+        }
+        /* sizemask is an unsigned long: shift a long-wide 1 and print
+         * it with %lx. */
+        sizemask = (1UL << MSG_SIZE_LARGE) | (1UL << MSG_SIZE_SMALL);
+        CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%lx\n",
+                        kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
+                                sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
+
+        /* Initialize Network Interface */
+        rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                rc = -ENOMEM;
+                goto out_gm_close;
+        }
+
+        /* Start receiver thread */
+        /* NOTE(review): the result of kernel_thread() is not checked;
+         * a failure here would leave the NAL without a receiver. */
+        kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
+
+        PORTAL_SYMBOL_REGISTER(kgmnal_ni);
+
+        kgmnal_data.kgm_init = 1;
+
+        return 0;
+
+out_gm_close:
+        /* Receive buffers already handed to GM are not reclaimed
+         * individually here; they are not tracked anywhere yet. */
+        gm_close(kgmnal_data.kgm_port);
+        kgmnal_data.kgm_port = NULL;
+out_gm_finalize:
+        gm_finalize();
+out_free_trans:
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        kgmnal_data.kgm_trans = NULL;
+        return rc;
+}
+
+MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
+MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
+MODULE_LICENSE("GPL");
+
+module_init (kgmnal_initialize);
+module_exit (kgmnal_finalize);
+
+EXPORT_SYMBOL (kgmnal_ni);
diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h
new file mode 100644 (file)
index 0000000..47e8c3c
--- /dev/null
@@ -0,0 +1,101 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _GMNAL_H
+#define _GMNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_GMNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <gm.h>
+
+
+/*
+ *  Myrinet GM NAL
+ */
+#define NPAGES_LARGE            16
+#define NPAGES_SMALL            1
+#define MSG_LEN_LARGE            (NPAGES_LARGE*PAGE_SIZE) /* parenthesized so casts and operators apply to the whole product */
+#define MSG_LEN_SMALL            (NPAGES_SMALL*PAGE_SIZE)
+#define MSG_SIZE_LARGE           (gm_min_size_for_length(MSG_LEN_LARGE))
+#define MSG_SIZE_SMALL           (gm_min_size_for_length(MSG_LEN_SMALL))
+
+#define TXMSGS                  64 /* Number of Transmit Messages */
+#define ENVELOPES               8  /* Number of outstanding receive msgs */
+
+#define KGM_PORT_NUM 3
+#define KGM_HOSTNAME "kgmnal"
+
+
+typedef struct { /* receive descriptor; kgm_add_recv() allocates these */
+        char *krx_buffer;
+        unsigned long   krx_len;
+        unsigned int   krx_size;       /* presumably a GM size class as returned by gm_min_size_for_length() -- TODO confirm */
+        unsigned int   krx_priority;   /* presumably GM_LOW_PRIORITY or GM_HIGH_PRIORITY -- TODO confirm */
+        struct list_head krx_item;     /* links this descriptor into kgmnal_data_t.kgm_list */
+}  kgmnal_rx_t;
+
+
+typedef struct { /* transmit descriptor; get_trans()/put_trans() in gmnal_cb.c allocate and free one at a time */
+        nal_cb_t  *ktx_nal;
+        void      *ktx_private;
+        lib_msg_t *ktx_cookie;
+        char      *ktx_buffer;
+        size_t     ktx_len;
+        unsigned long ktx_size;        /* presumably a GM size class -- TODO confirm */
+        int        ktx_ndx;
+        unsigned int ktx_priority;     /* presumably GM_LOW_PRIORITY or GM_HIGH_PRIORITY -- TODO confirm */
+        unsigned int ktx_tgt_node;
+        unsigned int ktx_tgt_port_id;
+}  kgmnal_tx_t;
+
+
+typedef struct { /* global state of the GM NAL; the single instance is kgmnal_data */
+        char              kgm_init;            /* set to 1 at the end of a successful kgmnal_initialize() */
+        char              kgm_shuttingdown;
+        struct gm_port   *kgm_port;            /* GM port obtained from gm_open() */
+        struct list_head  kgm_list;            /* list of kgmnal_rx_t, linked through krx_item */
+        ptl_nid_t         kgm_nid;             /* this node's id as reported by gm_get_node_id() */
+        nal_cb_t         *kgm_cb;              /* points at kgmnal_lib */
+        struct kgm_trans *kgm_trans;           /* NOTE(review): allocated in gmnal.c as TXMSGS * sizeof(kgmnal_tx_t), but typed struct kgm_trans -- confirm */
+        struct tq_struct  kgm_ready_tq;
+        spinlock_t        kgm_dispatch_lock;
+        spinlock_t        kgm_update_lock;
+        spinlock_t        kgm_send_lock;
+}  kgmnal_data_t;
+
+int kgm_init(kgmnal_data_t *kgm_data);
+int kgmnal_recv_thread(void *);
+int gm_return_mynid(void);
+void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+extern kgmnal_data_t      kgmnal_data;
+extern nal_t              kgmnal_api;
+extern nal_cb_t           kgmnal_lib;
+
+#endif  /* _GMNAL_H */
+
diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c
new file mode 100644 (file)
index 0000000..3d4c86d
--- /dev/null
@@ -0,0 +1,517 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* TODO
+ * preallocate send buffers, store on list
+ * put receive buffers on queue, handle with receive threads
+ * use routing
+ */
+
+#include "gmnal.h"
+
+extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
+
+/* Allocate one transmit descriptor with PORTAL_ALLOC.
+ * The caller (kgmnal_send) treats a NULL result as out-of-memory. */
+static kgmnal_tx_t *
+get_trans(void)
+{
+        kgmnal_tx_t *t;
+        PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
+        return t;
+}
+
+/* Release a transmit descriptor obtained from get_trans(). */
+static void
+put_trans(kgmnal_tx_t *t)
+{
+        PORTAL_FREE(t, sizeof(kgmnal_tx_t));
+}
+
+/*
+ * Return non-zero when 'nid' fits in an unsigned int and is below the
+ * value reported by gm_max_node_id_in_use() for our port.
+ *
+ * If GM cannot report that value, no nid is treated as a peer; the
+ * original code compared against an uninitialised variable in that case.
+ *
+ * NOTE(review): if GM reports the highest id actually in use, the test
+ * should probably be "<=" rather than "<" -- confirm against the GM docs.
+ */
+int
+kgmnal_ispeer (ptl_nid_t nid)
+{
+        unsigned int gmnid = (unsigned int)nid;
+        unsigned int nnids = 0;
+
+        if (gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids) != GM_SUCCESS)
+                return 0;
+
+        return ((ptl_nid_t)gmnid == nid && /* didn't lose high bits on conversion ? */
+                gmnid < nnids);            /* it's in this machine */
+}
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* cb_read: copy 'len' bytes from src_addr to dst_addr with a plain
+ * memcpy(); always returns 0. */
+static int
+kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+/* cb_write: copy 'len' bytes from src_addr to dst_addr with a plain
+ * memcpy(); always returns 0. */
+static int
+kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+/* cb_malloc: allocate 'len' bytes for the portals library via PORTAL_ALLOC
+ * and return the resulting pointer. */
+static void *
+kgmnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+/* cb_free: release a buffer of 'len' bytes via PORTAL_FREE; counterpart
+ * of kgmnal_malloc(). */
+static void
+kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/* cb_printf: when D_NET debugging is enabled, format the message into a
+ * 256-byte buffer (longer output is truncated by vsnprintf) and print
+ * it prefixed with the current CPU id.  Otherwise do nothing. */
+static void
+kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        if ((portal_debug & D_NET) == 0)
+                return;
+
+        va_start(args, fmt);
+        vsnprintf(text, sizeof(text), fmt, args);
+        va_end(args);
+
+        printk("CPUId: %d %s", smp_processor_id(), text);
+}
+
+
+/* cb_cli: take the dispatch spinlock, saving the interrupt state in
+ * *flags for the matching kgmnal_sti(). */
+static void
+kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *kgm = nal->nal_data;
+
+        spin_lock_irqsave(&kgm->kgm_dispatch_lock, *flags);
+}
+
+
+/* cb_sti: drop the dispatch spinlock and restore the interrupt state
+ * that kgmnal_cli() saved in *flags. */
+static void
+kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *kgm = nal->nal_data;
+
+        spin_unlock_irqrestore(&kgm->kgm_dispatch_lock, *flags);
+}
+
+
+/* cb_dist: report 0 for our own nid and 1 for every other nid (network
+ * distance doesn't mean much for this nal).  Always returns 0. */
+static int
+kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return 0;
+}
+
+/* FIXME rmr: add routing code here */
+/*
+ * Retire a completed send: finalize the library message, free the DMA
+ * buffer and release the descriptor.
+ *
+ * NOTE(review): 'error' is not used, so a failed send is finalized the
+ * same way as a successful one.
+ */
+static void
+kgmnal_tx_done(kgmnal_tx_t  *trans, int error)
+{
+        lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
+
+        gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
+
+        /* clear the buffer fields before the descriptor itself is freed */
+        trans->ktx_buffer = NULL;
+        trans->ktx_len = 0;
+
+        put_trans(trans);
+}
+/* Printable names of the GM send statuses handled by kgmnal_txhandler(),
+ * indexed by status code.  Codes not listed here are left NULL. */
+static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
+        [GM_SUCCESS] = "GM_SUCCESS",
+        [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
+        [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
+        [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
+        [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
+        [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
+        [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
+};
+
+/*
+ * Map a GM status code to its printable name.
+ *
+ * Returns "Unknown error" for codes that have no entry in
+ * gm_error_strings and for values outside [0, GM_NUM_STATUS_CODES),
+ * which would otherwise index past the ends of the table.
+ */
+inline char * get_error(int status)
+{
+        if (status < 0 || status >= GM_NUM_STATUS_CODES ||
+            gm_error_strings[status] == NULL)
+                return "Unknown error";
+
+        return gm_error_strings[status];
+}
+
+/* Callback handed to gm_resume_sending() / gm_drop_sends() by
+ * kgmnal_txhandler(); it only logs the context pointer and status. */
+static void
+kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        CDEBUG(D_NET, "error callback: ktx %p status %d\n",
+               context, status);
+}
+
+/*
+ * GM completion callback for sends posted by kgmnal_send(); 'context'
+ * is the kgmnal_tx_t of that send.  Failures are logged and followed by
+ * the GM call chosen for the status (resume sending, drop sends, or
+ * nothing).  The descriptor is always retired via kgmnal_tx_done().
+ */
+static void
+kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
+        int rc = -EIO;          /* anything but GM_SUCCESS is an I/O error */
+
+        LASSERT (p != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
+                ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
+
+        switch((int)status) {
+        case GM_SUCCESS:        /* normal */
+                rc = 0;
+                break;
+
+        case GM_SEND_TIMED_OUT: /* application error */
+        case GM_SEND_REJECTED:  /* size of msg unacceptable */
+        case GM_SEND_TARGET_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
+                                  ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                                  kgmnal_errhandler, NULL);
+                break;
+
+        case GM_SEND_TARGET_NODE_UNREACHABLE:
+        case GM_SEND_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
+                              ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                              kgmnal_errhandler, NULL);
+                break;
+
+        case GM_SEND_DROPPED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                break;
+
+        default:
+                CERROR("Unknown status: %d\n", status);
+                break;
+        }
+
+        kgmnal_tx_done(ktx, rc);
+}
+
+/*
+ * cb_send: send a portals message.
+ *
+ * GM doesn't support vectored writes, so the header and the 'len'
+ * payload bytes described by iov/niov are coalesced into one buffer
+ * from gm_dma_malloc() and posted with gm_send_with_callback().
+ * kgmnal_txhandler() runs on completion and, through kgmnal_tx_done(),
+ * finalizes 'cookie' and frees both the buffer and the descriptor.
+ *
+ * Returns 0 once the send has been posted, -ENOMEM if the descriptor or
+ * the buffer cannot be allocated, and -1 if header + payload exceeds
+ * MSG_LEN_LARGE.  On every failure path the descriptor is released
+ * again (it used to be leaked).
+ *
+ * 'type' and 'pid' are not used; the target port is always KGM_PORT_NUM.
+ */
+
+static int
+kgmnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type,
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           int              options,
+           unsigned int     niov,
+           lib_md_iov_t    *iov,
+           size_t           len)
+{
+        /*
+         * ipnal assumes that this is the private as passed to lib_dispatch..
+         * so do we :/
+         */
+        kgmnal_tx_t *ktx=NULL;
+        int rc=0;
+        void * buf;
+        int buf_len = sizeof(ptl_hdr_t) + len;
+        int buf_size = 0;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        PROF_START(gmnal_send);
+
+        /* 'len' is a size_t; cast it so it matches the %d conversion */
+        CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
+               (int)len, iov, nid, KGM_PORT_NUM);
+
+        /* save transaction info to trans for later finalize and cleanup */
+        ktx = get_trans();
+        if (ktx == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+
+        /* hmmm... GM doesn't support vectored write, so need to allocate
+           buffer to coalesce header and data.
+           Also, memory must be dma'able or registered with GM. */
+
+        if (buf_len <= MSG_LEN_SMALL) {
+                buf_size = MSG_SIZE_SMALL;
+        } else if (buf_len <= MSG_LEN_LARGE) {
+                buf_size = MSG_SIZE_LARGE;
+        } else {
+                printk("kgmnal:request exceeds TX MTU size (%d).\n",
+                       MSG_SIZE_LARGE);
+                rc = -1;
+                goto send_free_ktx;
+        }
+
+        buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
+        if (buf == NULL) {
+                rc = -ENOMEM;
+                goto send_free_ktx;
+        }
+        memcpy(buf, hdr, sizeof(ptl_hdr_t));
+
+        if (len != 0)
+                lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
+                                 options, niov, iov, len);
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+        ktx->ktx_len = buf_len;
+        ktx->ktx_size = buf_size;
+        ktx->ktx_buffer = buf;
+        ktx->ktx_priority = GM_LOW_PRIORITY;
+        ktx->ktx_tgt_node = nid;
+        ktx->ktx_tgt_port_id = KGM_PORT_NUM;
+
+        CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
+               "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
+               GM_LOW_PRIORITY);
+
+        /* from here on ktx and buf belong to the completion callback */
+        gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
+                              buf_len, GM_LOW_PRIORITY,
+                              nid, KGM_PORT_NUM,
+                              kgmnal_txhandler, ktx);
+
+        PROF_FINISH(gmnal_send);
+        return rc;
+
+ send_free_ktx:
+        /* nothing was handed to GM, so the descriptor is still ours to free */
+        put_trans(ktx);
+ send_exit:
+        return rc;
+}
+/* Packet-forwarding entry point; forwarding is not implemented, so this
+ * only logs an error and does nothing with 'fwd'. */
+void
+kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR("forwarding not implemented\n");
+}
+
+/* Forwarding completion stub: only logs that forwarding is unimplemented.
+ * NOTE(review): this non-static function carries the qswnal "kqswnal_"
+ * prefix although it lives in the GM NAL -- looks like a copy/paste
+ * leftover; confirm nothing references it before renaming to kgmnal_*. */
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Hand a receive buffer back to GM, using the size and priority it was
+ * received with. */
+static inline void
+kgmnal_requeue_rx(kgmnal_rx_t *krx)
+{
+        gm_provide_receive_buffer(kgmnal_data.kgm_port,
+                                  krx->krx_buffer,
+                                  krx->krx_size,
+                                  krx->krx_priority);
+}
+
+/*
+ * Process a received portals packet.
+ *
+ * Called from kgmnal_recv_thread() for every GM receive event:
+ *  - a packet shorter than a portals header is logged and its buffer is
+ *    given straight back to GM (unless we are shutting down);
+ *  - a packet addressed to this node is passed to lib_parse();
+ *  - anything else is logged and dropped, and its buffer is requeued.
+ *
+ * The kgmnal_rx_t passed to lib_parse() lives on this stack frame, so it
+ * is only valid until lib_parse() returns.
+ */
+static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
+                      void * buf, unsigned int pri)
+{
+        ptl_hdr_t  *hdr = buf;
+        kgmnal_rx_t krx;
+
+        CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
+
+        if ( len < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (kgm->kgm_shuttingdown)
+                        return;
+                /* log messages must end in a newline or they run together */
+                CERROR("kgmnal: did not receive complete portal header, "
+                       "len= %ld\n", len);
+                gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
+                return;
+        }
+
+        /* might want to use separate threads to handle receive */
+        krx.krx_buffer = buf;
+        krx.krx_len = len;
+        krx.krx_size = size;
+        krx.krx_priority = pri;
+
+        if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
+                PROF_FINISH(lib_parse);
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx: target is "
+                       "a peer\n", hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented yet\n");
+                kgmnal_requeue_rx(&krx);
+        }
+
+        return;
+}
+
+
+/*
+ * cb_recv: deliver the payload of a received message.
+ *
+ * 'private' is the kgmnal_rx_t that kgmnal_rx() passed to lib_parse().
+ * Copies 'mlen' payload bytes (those following the portals header) into
+ * iov/niov, finalizes 'cookie', gives the receive buffer back to GM and
+ * returns 'rlen'.
+ */
+static int kgmnal_recv(nal_cb_t     *nal,
+                      void         *private,
+                      lib_msg_t    *cookie,
+                      int           options,
+                      unsigned int  niov,
+                      lib_md_iov_t *iov,
+                      size_t        mlen,
+                      size_t        rlen)
+{
+        kgmnal_rx_t *krx = private;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        /* mlen/rlen are size_t; cast them to match the %d conversions */
+        CDEBUG(D_NET,"mlen=%d, rlen=%d\n", (int)mlen, (int)rlen);
+
+        /* What was actually received must be >= what sender claims to
+         * have sent.  This is an LASSERT, since lib-move doesn't
+         * check cb return code yet. */
+        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+        LASSERT (mlen <= rlen);
+
+        PROF_START(gmnal_recv);
+
+        if(mlen != 0) {
+                PROF_START(memcpy);
+                lib_copy_buf2iov (options, niov, iov,
+                                  krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
+                PROF_FINISH(memcpy);
+        }
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        /* the payload has been copied out, so GM can reuse the buffer */
+        kgmnal_requeue_rx(krx);
+
+        PROF_FINISH(gmnal_recv);
+
+        return rlen;
+}
+
+
+/* Alarm callback installed by recv_shutdown(); it only logs that it ran. */
+static void kgmnal_shutdown(void * none)
+{
+        CERROR("called\n");
+}
+
+/*
+ * Set terminate and use alarm to wake up the recv thread.
+ *
+ * The gm_alarm_t is handed to GM and stays in use after this function
+ * returns, until the alarm fires.  It therefore has static storage; an
+ * automatic variable would already be out of scope by then.  It is
+ * initialised only once so that a still-pending alarm is never
+ * re-initialised by a repeated call.
+ */
+static void  recv_shutdown(kgmnal_data_t *kgm)
+{
+        static gm_alarm_t alarm;
+        static int        alarm_initialised;
+
+        kgm->kgm_shuttingdown = 1;
+
+        if (!alarm_initialised) {
+                gm_initialize_alarm(&alarm);
+                alarm_initialised = 1;
+        }
+        gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+}
+
+/*
+ * Begin shutting the NAL down.  Only the receive thread is told to stop;
+ * waiting for sends to finish and removing the receive buffers are not
+ * done here.  Always returns 0.
+ */
+int kgmnal_end(kgmnal_data_t *kgm)
+{
+        /* wait for sends to finish ? */
+        /* remove receive buffers */
+
+        /* shutdown receive thread */
+        recv_shutdown(kgm);
+
+        return 0;
+}
+
+/* Used only for the spinner */
+/*
+ * Receive thread body: blocks in GM waiting for events until
+ * kgm_shuttingdown is set or GM returns no event.  Receive events are
+ * passed to kgmnal_rx(); every other event except a bad-send report is
+ * handed back to GM through gm_unknown().
+ *
+ * 'arg' is the kgmnal_data_t of the NAL.  Always returns 0.
+ */
+int kgmnal_recv_thread(void *arg)
+{
+        kgmnal_data_t *kgm = arg;
+
+        LASSERT(kgm != NULL);
+
+        kportal_daemonize("kgmnal_rx");
+
+        while(1) {
+                gm_recv_event_t *e;
+                int priority = GM_LOW_PRIORITY;
+                if (kgm->kgm_shuttingdown)
+                        break;
+
+                e = gm_blocking_receive_no_spin(kgm->kgm_port);
+                if (e == NULL) {
+                        CERROR("gm_blocking_receive returned NULL\n");
+                        break;
+                }
+
+                switch(gm_ntohc(e->recv.type)) {
+                case GM_HIGH_RECV_EVENT:
+                        priority = GM_HIGH_PRIORITY;
+                        /* fall through */
+                case GM_RECV_EVENT:
+                        kgmnal_rx(kgm, gm_ntohl(e->recv.length),
+                                  gm_ntohc(e->recv.size),
+                                  gm_ntohp(e->recv.buffer), priority);
+                        break;
+                case GM_ALARM_EVENT:
+                        /* newline added so the log line is terminated */
+                        CERROR("received alarm\n");
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
+                        CERROR("received bad send!\n");
+                        break;
+                default:
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                }
+        }
+
+        CERROR("shuttting down.\n");
+        return 0;
+}
+
+/* Callback table through which the portals library drives this NAL. */
+nal_cb_t kgmnal_lib = {
+        .nal_data  = &kgmnal_data,      /* NAL private data */
+        .cb_send   = kgmnal_send,
+        .cb_recv   = kgmnal_recv,
+        .cb_read   = kgmnal_read,
+        .cb_write  = kgmnal_write,
+        .cb_malloc = kgmnal_malloc,
+        .cb_free   = kgmnal_free,
+        .cb_printf = kgmnal_printf,
+        .cb_cli    = kgmnal_cli,
+        .cb_sti    = kgmnal_sti,
+        .cb_dist   = kgmnal_dist
+};
diff --git a/lustre/portals/knals/qswnal/Makefile.am b/lustre/portals/knals/qswnal/Makefile.am
new file mode 100644 (file)
index 0000000..6759b96
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kqswnal
+modulenet_DATA = kqswnal.o
+EXTRA_PROGRAMS = kqswnal
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h
diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c
new file mode 100644 (file)
index 0000000..d64b7ad
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+ptl_handle_ni_t                kqswnal_ni;
+nal_t                  kqswnal_api;
+kqswnal_data_t         kqswnal_data;
+
+/* Describes this NAL to the router: its NAL id and forwarding entry point. */
+kpr_nal_interface_t kqswnal_router_interface = {
+       .kprni_nalid = QSWNAL,
+       .kprni_arg   = NULL,
+       .kprni_fwd   = kqswnal_fwd_packet,
+};
+
+
+/*
+ * API "forward" method: pass request 'id' with its argument and return
+ * blocks to lib_dispatch().  args_len and ret_len are not used.
+ * Always returns PTL_OK.
+ */
+static int
+kqswnal_forward(nal_t   *nal,
+               int     id,
+               void    *args,  size_t args_len,
+               void    *ret,   size_t ret_len)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       /* there is exactly one instance of each of these objects */
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+       return (PTL_OK);
+}
+
+/* API "lock" method: delegates to the library's cb_cli callback, passing
+ * 'flags' through. */
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       /* there is exactly one instance of each of these objects */
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_cli(nal_cb,flags);
+}
+
+/* API "unlock" method: delegates to the library's cb_sti callback, passing
+ * 'flags' through; counterpart of kqswnal_lock(). */
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       /* there is exactly one instance of each of these objects */
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_sti(nal_cb,flags);
+}
+
+/* API "shutdown" method: logs the call and checks that it was made on
+ * kqswnal_api; nothing is torn down here.  Always returns 0. */
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (nal == &kqswnal_api);
+
+       return 0;
+}
+
+/* API "yield" method: give up the CPU if a reschedule is pending. */
+static void
+kqswnal_yield( nal_t *nal )
+{
+       CDEBUG (D_NET, "yield\n");
+
+       if (!current->need_resched)
+               return;
+
+       schedule();
+}
+
+/*
+ * NAL init entry point: initialise the portals library with this node's
+ * Elan node id and the number of Elan nodes, then return the API object.
+ * 'interface' and 'requested_pid' are not used.
+ */
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+            ptl_pid_t requested_pid)
+{
+       ptl_nid_t mynid;
+       int       nnids;
+
+       mynid = ep_nodeid (kqswnal_data.kqn_epdev);
+       nnids = ep_numnodes (kqswnal_data.kqn_epdev);
+
+       CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid,nnids);
+
+       lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+
+       return &kqswnal_api;
+}
+
+/*
+ * Tear the QSW NAL down.  How much is undone depends on how far
+ * initialisation got, as recorded in kqswnal_data.kqn_init.  It is also
+ * called from the failure paths of kqswnal_initialise(), so every
+ * resource is checked for NULL before it is released.
+ *
+ * NOTE(review): the function is marked __exit yet is called from the
+ * __init function kqswnal_initialise(); that looks unsafe if this code
+ * is ever built into the kernel rather than as a module -- confirm.
+ */
+void __exit
+kqswnal_finalise (void)
+{
+       switch (kqswnal_data.kqn_init)
+       {
+       default:
+               /* unknown state: assert (falls into the full teardown) */
+               LASSERT (0);
+
+       case KQN_INIT_ALL:
+               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+               /* fall through */
+
+       case KQN_INIT_PTL:
+               PtlNIFini (kqswnal_ni);
+               lib_fini (&kqswnal_lib);
+               /* fall through */
+
+       case KQN_INIT_DATA:
+               break;
+
+       case KQN_INIT_NOTHING:
+               /* nothing was set up, so there is nothing to undo */
+               return;
+       }
+
+       /**********************************************************************/
+       /* Make the router stop calling me and fail any further call-ins */
+       kpr_shutdown (&kqswnal_data.kqn_router);
+
+       /**********************************************************************/
+       /* flag threads to terminate, wake them and wait for them to die */
+
+       kqswnal_data.kqn_shuttingdown = 1;
+       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+       /* poll once a second until the thread count drops to zero */
+       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                      atomic_read (&kqswnal_data.kqn_nthreads));
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+
+       /**********************************************************************/
+       /* close elan comms */
+
+       if (kqswnal_data.kqn_eprx_small != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+       if (kqswnal_data.kqn_eprx_large != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+       if (kqswnal_data.kqn_eptx != NULL)
+               ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+       /**********************************************************************/
+       /* No more threads.  No more portals, router or comms callbacks!
+        * I control the horizontals and the verticals...
+        */
+
+       /**********************************************************************/
+       /* Complete any blocked forwarding packets with error
+        */
+
+       while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       /**********************************************************************/
+       /* Wait for router to complete any packets I sent her
+        */
+
+       kpr_deregister (&kqswnal_data.kqn_router);
+
+
+       /**********************************************************************/
+       /* Unmap message buffers and free all descriptors and buffers
+        */
+
+       /* page counts below mirror the elan3_dma_reserve() calls made in
+        * kqswnal_initialise() */
+       if (kqswnal_data.kqn_eprxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle, 0,
+                                 KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                 KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_eptxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle, 0,
+                                 KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_txds != NULL)
+       {
+               int   i;
+
+               /* the descriptor array is zeroed at allocation, so a buffer
+                * that was never allocated is still NULL here */
+               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+               {
+                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                       if (ktx->ktx_buffer != NULL)
+                               PORTAL_FREE(ktx->ktx_buffer,
+                                           KQSW_TX_BUFFER_SIZE);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_txds,
+                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                    KQSW_NNBLK_TXMSGS));
+       }
+
+       if (kqswnal_data.kqn_rxds != NULL)
+       {
+               int   i;
+               int   j;
+
+               /* free whichever pages were allocated for each descriptor */
+               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+               {
+                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                       for (j = 0; j < krx->krx_npages; j++)
+                               if (krx->krx_pages[j] != NULL)
+                                       __free_page (krx->krx_pages[j]);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_rxds,
+                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                   KQSW_NRXMSGS_LARGE));
+       }
+
+       /* resets flags, pointers to NULL etc */
+       memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+       printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+static int __init
+kqswnal_initialise (void)
+{
+       ELAN3_DMA_REQUEST dmareq;
+       int               rc;
+       int               i;
+       int               elan_page_idx;
+       int               pkmem = atomic_read(&portal_kmemory);
+
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+       kqswnal_api.forward  = kqswnal_forward;
+       kqswnal_api.shutdown = kqswnal_shutdown;
+       kqswnal_api.yield    = kqswnal_yield;
+       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
+       kqswnal_api.lock     = kqswnal_lock;
+       kqswnal_api.unlock   = kqswnal_unlock;
+       kqswnal_api.nal_data = &kqswnal_data;
+
+       kqswnal_lib.nal_data = &kqswnal_data;
+
+       /* ensure all pointers NULL etc */
+       memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       kqswnal_data.kqn_cb = &kqswnal_lib;
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+       spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+       spin_lock_init (&kqswnal_data.kqn_sched_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+       spin_lock_init (&kqswnal_data.kqn_statelock);
+
+       /* pointers/lists/locks initialised */
+       kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+       /**********************************************************************/
+       /* Find the first Elan device */
+
+       kqswnal_data.kqn_epdev = ep_device (0);
+       if (kqswnal_data.kqn_epdev == NULL)
+       {
+               CERROR ("Can't get elan device 0\n");
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the transmitter */
+
+       kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+       if (kqswnal_data.kqn_eptx == NULL)
+       {
+               CERROR ("Can't allocate transmitter\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the receivers */
+
+       kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_SMALL,
+                                                            KQSW_EP_ENVELOPES_SMALL);
+       if (kqswnal_data.kqn_eprx_small == NULL)
+       {
+               CERROR ("Can't install small msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_LARGE,
+                                                            KQSW_EP_ENVELOPES_LARGE);
+       if (kqswnal_data.kqn_eprx_large == NULL)
+       {
+               CERROR ("Can't install large msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+       rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                             KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                             &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+       rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise transmit descriptors */
+
+       PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       if (kqswnal_data.kqn_txds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /* clear flags, null pointers etc */
+       memset(kqswnal_data.kqn_txds, 0,
+              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+       {
+               int           premapped_pages;
+               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+               int           basepage = i * KQSW_NTXMSGPAGES;
+
+               PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+               if (ktx->ktx_buffer == NULL)
+               {
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+
+               /* Map pre-allocated buffer NOW, to save latency on transmit */
+               premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                       KQSW_TX_BUFFER_SIZE);
+
+               elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                      kqswnal_data.kqn_eptxdmahandle,
+                                      ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                      basepage, &ktx->ktx_ebuffer);
+
+               ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+               ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+               if (i < KQSW_NTXMSGS)
+                       ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+               else
+                       ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+               list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise receive descriptors */
+
+       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+       if (kqswnal_data.kqn_rxds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+       elan_page_idx = 0;
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               E3_Addr       elanaddr;
+               int           j;
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               if (i < KQSW_NRXMSGS_SMALL)
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+               }
+               else
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+               }
+
+               LASSERT (krx->krx_npages > 0);
+               for (j = 0; j < krx->krx_npages; j++)
+               {
+                       krx->krx_pages[j] = alloc_page (GFP_KERNEL);
+                       if (krx->krx_pages[j] == NULL)
+                       {
+                               kqswnal_finalise ();
+                               return (-ENOMEM);
+                       }
+
+                       LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                       elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                             kqswnal_data.kqn_eprxdmahandle,
+                                             page_address(krx->krx_pages[j]),
+                                             PAGE_SIZE, elan_page_idx,
+                                             &elanaddr);
+                       elan_page_idx++;
+
+                       if (j == 0)
+                               krx->krx_elanaddr = elanaddr;
+
+                       /* NB we assume the pages map contiguously in Elan address space */
+                       LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+               }
+       }
+       LASSERT (elan_page_idx ==
+                (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+       /**********************************************************************/
+       /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+       {
+               CERROR ("PtlNIInit failed %d\n", rc);
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+       /**********************************************************************/
+       /* Queue receives, now that it's OK to run their completion callbacks */
+
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               /* NB this enqueue can allocate/sleep (attr == 0) */
+               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                     krx->krx_elanaddr,
+                                     krx->krx_npages * PAGE_SIZE, 0);
+               if (rc != 0)
+               {
+                       CERROR ("failed ep_queue_receive %d\n", rc);
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+       }
+
+       /**********************************************************************/
+       /* Spawn scheduling threads */
+       for (i = 0; i < smp_num_cpus; i++)
+       {
+               rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+               if (rc != 0)
+               {
+                       CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                       kqswnal_finalise ();
+                       return (rc);
+               }
+       }
+
+       /**********************************************************************/
+       /* Connect to the router */
+       rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+       CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
+
+       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+       kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+       printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+              "(Routing %s, initial mem %d)\n", 
+              ep_nodeid (kqswnal_data.kqn_epdev),
+              ep_numnodes (kqswnal_data.kqn_epdev),
+              kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+              pkmem);
+
+       return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);       /* module load entry point */
+module_exit (kqswnal_finalise);         /* module unload entry point */
+
+EXPORT_SYMBOL (kqswnal_ni);             /* NI handle filled in by PtlNIInit() */
diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h
new file mode 100644 (file)
index 0000000..657b02b
--- /dev/null
@@ -0,0 +1,249 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _QSWNAL_H
+#define _QSWNAL_H
+#define EXPORT_SYMTAB
+
+#ifdef PROPRIETARY_ELAN
+# include <qsw/kernel.h>
+#else
+# include <qsnet/kernel.h>
+#endif
+
+#undef printf                                   /* nasty QSW #define */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <elan3/elanregs.h>
+#include <elan3/elandev.h>
+#include <elan3/elanvp.h>
+#include <elan3/elan3mmu.h>
+#include <elan3/elanctxt.h>
+#include <elan3/elandebug.h>
+#include <elan3/urom_addrs.h>
+#include <elan3/busops.h>
+#include <elan3/kcomm.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_QSWNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define KQSW_CHECKSUM  0                       /* non-zero compiles in kqsw_csum_t / kqsw_csum() */
+#if KQSW_CHECKSUM
+typedef unsigned long kqsw_csum_t;
+#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t))      /* room for two checksum values */
+#else
+#define KQSW_CSUM_SIZE 0                       /* no checksum space when disabled */
+#endif
+#define KQSW_HDR_SIZE  (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)   /* portals header plus checksum space */
+
+/*
+ *  Elan NAL
+ */
+#define EP_SVC_LARGE_PORTALS_SMALL     (0x10)  /* Portals over elan port number (small payloads) */
+#define EP_SVC_LARGE_PORTALS_LARGE     (0x11)  /* Portals over elan port number (large payloads) */
+/* NB small/large message sizes are GLOBAL constants */
+
+/*
+ * Performance Tuning defines
+ * NB sizes are fixed byte counts with no mention of PAGE_SIZE, for
+ * interoperability
+ */
+#if PTL_LARGE_MTU
+# define KQSW_MAXPAYLOAD               (256<<10) /* biggest message this NAL will cope with */
+#else
+# define KQSW_MAXPAYLOAD               (64<<10) /* biggest message this NAL will cope with */
+#endif
+
+#define KQSW_SMALLPAYLOAD              ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
+
+#define KQSW_TX_MAXCONTIG              (1<<10) /* largest payload that gets made contiguous on transmit */
+
+#define KQSW_NTXMSGS                   8       /* # normal transmit messages */
+#define KQSW_NNBLK_TXMSGS              128     /* # reserved transmit messages if can't block */
+
+#define KQSW_NRXMSGS_LARGE             64      /* # large receive buffers */
+#define KQSW_EP_ENVELOPES_LARGE        128     /* # large ep envelopes */
+
+#define KQSW_NRXMSGS_SMALL             256     /* # small receive buffers */
+#define KQSW_EP_ENVELOPES_SMALL                2048    /* # small ep envelopes */
+
+#define KQSW_RESCHED                   100     /* # busy loops that forces scheduler to yield */
+
+/*
+ * derived constants
+ */
+
+#define KQSW_TX_BUFFER_SIZE    (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+/* The pre-allocated tx buffer (hdr + small payload) */
+
+#define KQSW_NTXMSGPAGES       (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+/* Reserve elan address space for pre-allocated and pre-mapped transmit
+ * buffer and a full payload too.  Each "+ 1" is an extra page allowing for
+ * a buffer/payload that does not start on a page boundary */
+
+#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
+/* size in bytes of a small receive buffer */
+
+#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
+/* biggest complete packet we can receive (or transmit) */
+
+
+typedef struct                                  /* one receive descriptor and its page-backed buffer */
+{
+        struct list_head krx_list;              /* enqueue -> thread */
+        EP_RCVR                *krx_eprx;              /* port to post receives to (small or large receiver) */
+        EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
+        E3_Addr          krx_elanaddr;          /* Elan address of buffer (contiguous in elan vm) */
+        int              krx_npages;            /* # pages in receive buffer */
+        int              krx_nob;               /* Number Of Bytes received into buffer */
+        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
+        struct page     *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated (first krx_npages entries used) */
+        struct iovec     krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */
+}  kqswnal_rx_t;
+
+typedef struct                                  /* one transmit descriptor */
+{
+        struct list_head  ktx_list;             /* enqueue idle/delayed */
+        struct list_head *ktx_idle;             /* where to put when idle (normal or non-blocking reserve pool) */
+        char              ktx_state;            /* What I'm doing (KTX_IDLE, KTX_SENDING or KTX_FORWARDING) */
+        uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
+        int               ktx_npages;           /* pages reserved for mapping messages */
+        int               ktx_nmappedpages;     /* # pages mapped for current message */
+        EP_IOVEC         ktx_iov[EP_MAXFRAG];  /* msg frags (elan vaddrs) */
+        int               ktx_niov;             /* # message frags */
+        int               ktx_port;             /* destination ep port */
+        ptl_nid_t         ktx_nid;              /* destination node */
+        void             *ktx_args[2];          /* completion passthru: [0] is the fwd desc when forwarding;
+                                                 * [0] and [1] go to lib_finalize() when sending */
+        E3_Addr                  ktx_ebuffer;          /* elan address of ktx_buffer */
+        char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
+} kqswnal_tx_t;
+
+#define KTX_IDLE       0                       /* MUST BE ZERO (so zeroed ktx is idle) */
+#define KTX_SENDING    1                       /* local send */
+#define KTX_FORWARDING 2                       /* routing a packet */
+
+typedef struct                                  /* all global state of the QSW NAL */
+{
+        char               kqn_init;            /* what's been initialised (KQN_INIT_*) */
+        char               kqn_shuttingdown;    /* I'm trying to shut down */
+        atomic_t           kqn_nthreads;        /* # threads still running */
+
+        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+
+        struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
+        struct list_head   kqn_nblk_idletxds;   /* reserve of idle transmit descriptors for senders that can't block */
+        spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
+        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
+        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+        
+        spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
+        wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
+
+        struct list_head   kqn_readyrxds;       /* rxds full of data */
+        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_delayedtxds;     /* delayed transmits */
+
+        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
+        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
+       EP_DEV            *kqn_epdev;           /* elan device */
+       EP_XMTR           *kqn_eptx;            /* elan transmitter */
+       EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
+        EP_RCVR                  *kqn_eprx_large;      /* elan receiver (large messages) */
+       ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
+       ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
+        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
+}  kqswnal_data_t;
+
+/* kqn_init state */
+#define KQN_INIT_NOTHING       0               /* MUST BE ZERO so zeroed state is initialised OK */
+#define KQN_INIT_DATA          1
+#define KQN_INIT_PTL           2               /* set once PtlNIInit() has succeeded */
+#define KQN_INIT_ALL           3               /* set at the end of kqswnal_initialise() */
+
+extern nal_cb_t        kqswnal_lib;
+extern nal_t           kqswnal_api;
+extern kqswnal_data_t  kqswnal_data;
+
+extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
+extern void kqswnal_rxhandler(EP_RXD *rxd);
+extern int kqswnal_scheduler (void *);
+extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+/* Repost a receive descriptor: hand krx's whole buffer (krx_npages pages at
+ * krx_elanaddr) back to the Elan receiver, with kqswnal_rxhandler as the
+ * completion callback. */
+static inline void
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
+{
+        ep_requeue_receive (krx->krx_rxd,
+                            kqswnal_rxhandler, krx,
+                            krx->krx_elanaddr,
+                            krx->krx_npages * PAGE_SIZE);
+}
+
+/* Return the number of pages touched by the 'nob' bytes starting at 'base':
+ * the page index of the last byte minus that of the first byte, plus one. */
+static inline int
+kqswnal_pages_spanned (void *base, int nob)
+{
+        unsigned long start      = (unsigned long)base;
+        unsigned long first_page = start >> PAGE_SHIFT;
+        unsigned long last_page  = (start + (nob - 1)) >> PAGE_SHIFT;
+
+        /* can't wrap address space */
+        LASSERT (last_page >= first_page);
+
+        return (last_page - first_page + 1);
+}
+
+#if KQSW_CHECKSUM
+/* Add each of the 'nob' bytes at 'base' (taken as unsigned chars) to the
+ * running checksum 'sum' and return the new total. */
+static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
+{
+        unsigned char *byte = (unsigned char *)base;
+
+        for (; nob > 0; nob--)
+                sum += *byte++;
+
+        return (sum);
+}
+#endif
+
+#endif /* _QSWNAL_H */
diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c
new file mode 100644 (file)
index 0000000..5979885
--- /dev/null
@@ -0,0 +1,1242 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+atomic_t kqswnal_packets_launched;      /* bumped when ep_transmit_large() accepts a packet */
+atomic_t kqswnal_packets_transmitted;   /* bumped when a transmit completes with EP_SUCCESS */
+atomic_t kqswnal_packets_received;      /* presumably bumped on the receive path -- TODO confirm */
+
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* LIB callback: copy 'len' bytes from 'src_addr' to 'dst_addr'.
+ * Both addresses are accessed directly with memcpy(); 'private' is unused.
+ * Always returns 0. */
+static int
+kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr);
+
+        memcpy (dst_addr, src_addr, len);
+        return (0);
+}
+
+/* LIB callback: copy 'len' bytes from 'src_addr' to 'dst_addr'.
+ * Both addresses are accessed directly with memcpy(); 'private' is unused.
+ * Always returns 0. */
+static int
+kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+              size_t len)
+{
+        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr);
+
+        memcpy (dst_addr, src_addr, len);
+        return (0);
+}
+
+/* LIB callback: allocate 'len' bytes with PORTAL_ALLOC and return the
+ * resulting pointer.  'nal' is unused. */
+static void *
+kqswnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *mem;
+
+        PORTAL_ALLOC(mem, len);
+
+        return (mem);
+}
+
+/* LIB callback: release the 'len'-byte buffer 'buf' with PORTAL_FREE.
+ * 'nal' is unused. */
+static void
+kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE (buf, len);
+}
+
+/* LIB callback: format a printf-style message into a 256-byte stack buffer
+ * (longer output is truncated) and emit it via CDEBUG at D_NET level. */
+static void
+kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        va_start (args, fmt);
+        vsnprintf (text, sizeof (text), fmt, args);    /* bounded format */
+        va_end (args);
+
+        /* force termination whatever vsnprintf did */
+        text[sizeof (text) - 1] = 0;
+
+        CDEBUG (D_NET, "%s", text);
+}
+
+
+/* LIB callback: take the NAL state lock (kqn_statelock) with interrupts
+ * disabled, saving the previous interrupt state in *flags. */
+static void
+kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *qsw = nal->nal_data;
+
+        spin_lock_irqsave (&qsw->kqn_statelock, *flags);
+}
+
+
+/* LIB callback: drop the NAL state lock (kqn_statelock) and restore the
+ * interrupt state previously saved in *flags. */
+static void
+kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *qsw = nal->nal_data;
+
+        spin_unlock_irqrestore (&qsw->kqn_statelock, *flags);
+}
+
+
+/* LIB callback: report the "distance" to 'nid' in *dist: 0 for this node's
+ * own nid, 1 for anything else.  Always returns 0. */
+static int
+kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* network distance doesn't mean much for this nal */
+        if (nid == nal->ni.nid)
+                *dist = 0;
+        else
+                *dist = 1;
+
+        return (0);
+}
+
+/* Return non-zero iff 'nid' names a node on this Elan network: it must
+ * survive conversion to an unsigned int elan id unchanged, and that id must
+ * be below ep_numnodes(). */
+int
+kqswnal_ispeer (ptl_nid_t nid)
+{
+        unsigned int elanid = (unsigned int)nid;
+
+        /* lost high bits on conversion => can't be an elan id */
+        if ((ptl_nid_t)elanid != nid)
+                return (0);
+
+        /* is it in this machine? */
+        return (elanid < ep_numnodes (kqswnal_data.kqn_epdev));
+}
+
+/* Unload the Elan mappings made for the current message of 'ktx' (the
+ * ktx_nmappedpages pages starting at ktx_basepage) and reset the mapped
+ * page count.  Does nothing if no pages are mapped. */
+void
+kqswnal_unmap_tx (kqswnal_tx_t *ktx)
+{
+        if (ktx->ktx_nmappedpages != 0) {
+                CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
+                        ktx, ktx->ktx_niov, ktx->ktx_basepage,
+                        ktx->ktx_nmappedpages);
+
+                /* mapping must lie inside this txd's reservation... */
+                LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
+                /* ...and inside the reserved tx DVMA space */
+                LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
+                         kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
+
+                elan3_dvma_unload (kqswnal_data.kqn_epdev->DmaState,
+                                   kqswnal_data.kqn_eptxdmahandle,
+                                   ktx->ktx_basepage, ktx->ktx_nmappedpages);
+
+                ktx->ktx_nmappedpages = 0;
+        }
+}
+
+/*
+ * Map the first 'nob' bytes described by the page-based fragments 'kiov'
+ * into the Elan tx address space reserved for 'ktx', appending to ktx_iov.
+ *
+ * One Elan page is consumed per kiov entry.  A fragment whose Elan address
+ * directly follows the previous one is merged into it.  ktx_nmappedpages is
+ * updated after every page so a failed call leaves an accurate count.
+ *
+ * Returns 0 on success, or -EMSGSIZE if the message needs more pages than
+ * ktx_npages or more fragments than ktx_iov can hold.
+ */
+int
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+{
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       maxmapped = ktx->ktx_npages;
+        int       nfrags    = ktx->ktx_niov;
+        int       nmapped   = ktx->ktx_nmappedpages;
+        uint32_t  elanpage  = ktx->ktx_basepage + nmapped;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        for (;;) {
+                EP_IOVEC *frag;
+                char     *vaddr;
+                int       fraglen = kiov->kiov_len;
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                /* each frag fits in a page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                nmapped++;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                frag = &ktx->ktx_iov[nfrags];
+
+                /* XXX stop-gap: kmap the page, since EKC only offers a
+                 * vaddr (rather than page) mapping interface */
+                vaddr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, page %d, %d total\n",
+                        ktx, nfrags, vaddr, fraglen, elanpage, nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       vaddr, fraglen,
+                                       elanpage, &frag->Base);
+
+                kunmap (kiov->kiov_page);
+
+                /* record progress inside the loop, for the failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&
+                    frag->Base == (frag[-1].Base + frag[-1].Len)) {
+                        /* contiguous with previous frag: just extend it */
+                        frag[-1].Len += fraglen;
+                } else {
+                        /* start a new frag */
+                        frag->Len = fraglen;
+                        nfrags++;
+                }
+
+                elanpage++;
+                kiov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+                if (nob <= 0)
+                        break;
+        }
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/*
+ * Map the first 'nob' bytes described by the virtual-address fragments
+ * 'iov' into the Elan tx address space reserved for 'ktx', appending to
+ * ktx_iov.
+ *
+ * Each iov entry consumes as many Elan pages as its buffer spans.  A
+ * fragment whose Elan address directly follows the previous one is merged
+ * into it.  ktx_nmappedpages is updated after every entry so a failed call
+ * leaves an accurate count.
+ *
+ * Returns 0 on success, or -EMSGSIZE if the message needs more pages than
+ * ktx_npages or more fragments than ktx_iov can hold.
+ */
+int
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+{
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       maxmapped = ktx->ktx_npages;
+        int       nfrags    = ktx->ktx_niov;
+        int       nmapped   = ktx->ktx_nmappedpages;
+        uint32_t  elanpage  = ktx->ktx_basepage + nmapped;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        for (;;) {
+                EP_IOVEC *frag;
+                int       fraglen = iov->iov_len;
+                long      span    = kqswnal_pages_spanned (iov->iov_base,
+                                                           fraglen);
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+
+                nmapped += span;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                frag = &ktx->ktx_iov[nfrags];
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
+                        ktx, nfrags, iov->iov_base, fraglen, elanpage, span,
+                        nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       iov->iov_base, fraglen,
+                                       elanpage, &frag->Base);
+
+                /* record progress inside the loop, for the failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&
+                    frag->Base == (frag[-1].Base + frag[-1].Len)) {
+                        /* contiguous with previous frag: just extend it */
+                        frag[-1].Len += fraglen;
+                } else {
+                        /* start a new frag */
+                        frag->Len = fraglen;
+                        nfrags++;
+                }
+
+                elanpage += span;
+                iov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+                if (nob <= 0)
+                        break;
+        }
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/*
+ * Return 'ktx' to the idle pool it belongs to (ktx_idle), after dropping
+ * its temporary Elan mappings and marking it KTX_IDLE.
+ *
+ * When the descriptor goes back to the normal pool, anything waiting for a
+ * descriptor is kicked: the first forwarded packet queued on
+ * kqn_idletxd_fwdq is moved to kqn_delayedfwds for the scheduler, and any
+ * process sleeping on kqn_idletxd_waitq is woken.  Descriptors from the
+ * non-blocking reserve wake nobody.
+ */
+void
+kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
+{
+        struct list_head *pool = ktx->ktx_idle;
+        kpr_fwd_desc_t   *fwd  = NULL;
+        unsigned long     flags;
+
+        /* release temporary mappings */
+        kqswnal_unmap_tx (ktx);
+        ktx->ktx_state = KTX_IDLE;
+
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        list_add (&ktx->ktx_list, pool);
+
+        /* only the normal pool has waiters; the other is reserved for
+         * non-blocking tx */
+        if (pool != &kqswnal_data.kqn_nblk_idletxds) {
+
+                /* forwarded packet blocked for a tx descriptor? */
+                if (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) {
+                        CDEBUG(D_NET,"wakeup fwd\n");
+
+                        fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                          kpr_fwd_desc_t, kprfd_list);
+                        list_del (&fwd->kprfd_list);
+                }
+
+                /* local sender (process) blocked for a tx descriptor? */
+                if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq)) {
+                        CDEBUG(D_NET,"wakeup process\n");
+                        wake_up (&kqswnal_data.kqn_idletxd_waitq);
+                }
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        if (fwd == NULL)
+                return;
+
+        /* schedule packet for forwarding again */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+/*
+ * Get an idle transmit descriptor.
+ *
+ * A descriptor from the normal pool is used whenever one is free.
+ * Otherwise:
+ *  - if 'fwd' is non-NULL it is queued on kqn_idletxd_fwdq and NULL is
+ *    returned;
+ *  - else if 'may_block' is zero, a descriptor is taken from the
+ *    non-blocking reserve, or NULL is returned if that is empty too;
+ *  - else the caller sleeps until the normal pool is non-empty and retries.
+ */
+kqswnal_tx_t *
+kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+{
+        kqswnal_tx_t  *ktx = NULL;
+        unsigned long  flags;
+
+        /* NB every 'break' below leaves the loop with the lock held */
+        for (;;) {
+                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
+                        /* "normal" descriptor is free */
+                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* "normal" descriptor pool is empty */
+
+                if (fwd != NULL) {
+                        /* forwarded packet => queue for idle txd */
+                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                        list_add_tail (&fwd->kprfd_list,
+                                       &kqswnal_data.kqn_idletxd_fwdq);
+                        break;
+                }
+
+                /* doing a local transmit */
+                if (!may_block) {
+                        if (!list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
+                                ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
+                                                  kqswnal_tx_t, ktx_list);
+                                list_del (&ktx->ktx_list);
+                        } else {
+                                CERROR ("intr tx desc pool exhausted\n");
+                        }
+                        break;
+                }
+
+                /* block for idle tx */
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                CDEBUG (D_NET, "blocking for tx desc\n");
+                wait_event (kqswnal_data.kqn_idletxd_waitq,
+                            !list_empty (&kqswnal_data.kqn_idletxds));
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
+        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
+        return (ktx);
+}
+
+/*
+ * Complete a transmit: notify whoever originated the packet, then return
+ * 'ktx' to its idle pool.
+ *
+ * A forwarded packet is completed with kpr_fwd_done(), which receives
+ * 'error'; a locally sourced packet is completed with lib_finalize(), which
+ * is not passed 'error'.  Any other state trips an assertion.
+ */
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+{
+        if (ktx->ktx_state == KTX_FORWARDING) {
+                /* router asked me to forward this packet */
+                kpr_fwd_done (&kqswnal_data.kqn_router,
+                              (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
+        } else if (ktx->ktx_state == KTX_SENDING) {
+                /* packet sourced locally */
+                lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+                              (lib_msg_t *)ktx->ktx_args[1]);
+        } else {
+                LASSERT (0);
+        }
+
+        kqswnal_put_idle_tx (ktx);
+}
+
+/*
+ * Elan transmit completion callback ('arg' is the kqswnal_tx_t that was
+ * launched).  Counts successful transmits; any other status is logged and
+ * passed to kqswnal_tx_done() as -EIO.
+ */
+static void
+kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
+{
+        kqswnal_tx_t      *ktx = (kqswnal_tx_t *)arg;
+
+        LASSERT (txd != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
+
+        if (status == EP_SUCCESS) {
+                atomic_inc (&kqswnal_packets_transmitted);
+        } else {
+                CERROR ("kqswnal: Transmit failed with %d\n", status);
+                status = -EIO;
+        }
+
+        kqswnal_tx_done (ktx, status);
+}
+
+/*
+ * Hand a fully set-up transmit descriptor to the Elan transmitter.
+ *
+ * Returns the ep_transmit_large() result, except that ENOMEM is absorbed:
+ * the descriptor is appended to kqn_delayedtxds, the scheduler wait queue is
+ * woken, and 0 is returned.  kqswnal_packets_launched is bumped only when
+ * the transmit was accepted.
+ */
+int
+kqswnal_launch (kqswnal_tx_t *ktx)
+{
+        /* Don't block for transmit descriptor if we're in interrupt context */
+        int            attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
+        int            rc   = ep_transmit_large(kqswnal_data.kqn_eptx, ktx->ktx_nid,
+                                                ktx->ktx_port, attr, kqswnal_txhandler,
+                                                ktx, ktx->ktx_iov, ktx->ktx_niov);
+        unsigned long  flags;   /* spin_lock_irqsave() requires unsigned long */
+
+        if (rc == 0)
+                atomic_inc (&kqswnal_packets_launched);
+
+        if (rc != ENOMEM)
+                return (rc);
+
+        /* can't allocate ep txd => queue for later */
+
+        LASSERT (in_interrupt());      /* not called by thread (not looping) */
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        return (0);
+}
+
+
+/*
+ * Return a printable name for the message type of a portals header.
+ *
+ * The type field is converted with NTOH__u32() before it is examined, in
+ * the same way kqswnal_cerror_hdr() converts it before switching on it.
+ * Unrecognised types yield "<UNKNOWN>".  The returned string is a literal
+ * and must not be modified or freed.
+ */
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (NTOH__u32 (hdr->type)) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+/*
+ * Dump a portals header to the error log: the source and destination
+ * nid/pid followed by the type-specific fields.  Fields that the rest of
+ * this function byte-swaps are converted with NTOH__u32/NTOH__u64; the
+ * wire handle cookies and the PUT hdr_data are printed as stored.
+ */
+static void
+kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        CERROR("P3 Header at %p of type %s\n", hdr, type_str);
+        /* each CERROR is a separate log record, so terminate every line */
+        CERROR("    From nid/pid "LPU64"/%u\n", NTOH__u64(hdr->src_nid),
+               NTOH__u32(hdr->src_pid));
+        CERROR("    To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
+               NTOH__u32(hdr->dest_pid));
+
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_PUT:
+                CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64 (hdr->msg.put.match_bits));
+                CERROR("    Length %d, offset %d, hdr data "LPX64"\n",
+                       NTOH__u32(PTL_HDR_LENGTH(hdr)),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.get.ptl_index),
+                       hdr->msg.get.return_wmd.wh_interface_cookie,
+                       hdr->msg.get.return_wmd.wh_object_cookie,
+                       /* converted like the PUT match bits above */
+                       NTOH__u64 (hdr->msg.get.match_bits));
+                CERROR("    Length %d, src offset %d\n",
+                       NTOH__u32 (hdr->msg.get.sink_length),
+                       NTOH__u32 (hdr->msg.get.src_offset));
+                break;
+
+        case PTL_MSG_ACK:
+                CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (hdr->msg.ack.mlength));
+                break;
+
+        case PTL_MSG_REPLY:
+                CERROR("    dst md "LPX64"."LPX64", length %d\n",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (PTL_HDR_LENGTH(hdr)));
+                break;
+
+        default:
+                /* unknown type: nothing type-specific to print */
+                break;
+        }
+
+}                               /* end of kqswnal_cerror_hdr() */
+
+static int
+kqswnal_sendmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 ptl_kiov_t   *payload_kiov,
+                 size_t        payload_nob)
+{
+        kqswnal_tx_t      *ktx;
+        int                rc;
+        ptl_nid_t          gatewaynid;
+#if KQSW_CHECKSUM
+        int                i;
+        kqsw_csum_t        csum;
+        int                sumnob;
+#endif
+        
+        /* NB, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
+               " pid %u\n", payload_nob, payload_niov, nid, pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        
+        if (payload_nob > KQSW_MAXPAYLOAD) {
+                CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
+                        payload_nob, KQSW_MAXPAYLOAD);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        if (!kqswnal_ispeer (nid)) {     /* Can't send direct: find gateway? */
+                rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                if (!kqswnal_ispeer (gatewaynid)) {
+                        CERROR("Bad gateway "LPX64" for "LPX64"\n",
+                               gatewaynid, nid);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                nid = gatewaynid;
+        }
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
+                                          type == PTL_MSG_REPLY ||
+                                          in_interrupt()));
+        if (ktx == NULL) {
+                kqswnal_cerror_hdr (hdr);
+                lib_finalize (&kqswnal_lib, private, cookie);
+        }
+
+        memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
+
+#if KQSW_CHECKSUM
+        csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
+        memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
+        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+                if (payload_kiov != NULL) {
+                        ptl_kiov_t *kiov = &payload_kiov[i];
+                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
+                                           kiov->kiov_offset;
+                        
+                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
+                        sumnob -= kiov->kiov_len;
+                } else {
+                        struct iovec *iov = &payload_iov[i];
+
+                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
+                        sumnob -= iov->iov_len;
+                }
+        }
+        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+#endif
+
+        /* Set up first frag from pre-mapped buffer (it's at least the
+         * portals header) */
+        ktx->ktx_iov[0].Base = ktx->ktx_ebuffer;
+        ktx->ktx_iov[0].Len = KQSW_HDR_SIZE;
+        ktx->ktx_niov = 1;
+
+        if (payload_nob > 0) { /* got some payload (something more to do) */
+                /* make a single contiguous message? */
+                if (payload_nob <= KQSW_TX_MAXCONTIG) {
+                        /* copy payload to ktx_buffer, immediately after hdr */
+                        if (payload_kiov != NULL)
+                                lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                   payload_niov, payload_kiov, payload_nob);
+                        else
+                                lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                  payload_niov, payload_iov, payload_nob);
+                        /* first frag includes payload */
+                        ktx->ktx_iov[0].Len += payload_nob;
+                } else {
+                        if (payload_kiov != NULL)
+                                rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                                                          payload_niov, payload_kiov);
+                        else
+                                rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                                                         payload_niov, payload_iov);
+                        if (rc != 0) {
+                                kqswnal_put_idle_tx (ktx);
+                                lib_finalize (&kqswnal_lib, private, cookie);
+                                return (-1);
+                        }
+                } 
+        }
+
+        ktx->ktx_port    = (payload_nob <= KQSW_SMALLPAYLOAD) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_SENDING;   /* => lib_finalize() on completion */
+        ktx->ktx_args[0] = private;
+        ktx->ktx_args[1] = cookie;
+
+        rc = kqswnal_launch (ktx);
+        if (rc != 0) {                    /* failed? */
+                CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid);
+        return (0);
+}
+
+/*
+ * cb_send entry point: send a message whose payload is described by
+ * virtual-address fragments.  Delegates to kqswnal_sendmsg() with no
+ * page-based fragment list and returns its result.
+ */
+static int
+kqswnal_send (nal_cb_t     *nal,
+              void         *private,
+              lib_msg_t    *cookie,
+              ptl_hdr_t    *hdr,
+              int           type,
+              ptl_nid_t     nid,
+              ptl_pid_t     pid,
+              unsigned int  payload_niov,
+              struct iovec *payload_iov,
+              size_t        payload_nob)
+{
+        int rc;
+
+        rc = kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, payload_iov, NULL, payload_nob);
+        return (rc);
+}
+
+/*
+ * cb_send_pages entry point: send a message whose payload is described by
+ * page fragments.  Delegates to kqswnal_sendmsg() with no virtual-address
+ * fragment list and returns its result.
+ */
+static int
+kqswnal_send_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    ptl_hdr_t    *hdr,
+                    int           type,
+                    ptl_nid_t     nid,
+                    ptl_pid_t     pid,
+                    unsigned int  payload_niov,
+                    ptl_kiov_t   *payload_kiov,
+                    size_t        payload_nob)
+{
+        int rc;
+
+        rc = kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                              payload_niov, NULL, payload_kiov, payload_nob);
+        return (rc);
+}
+
+/* When non-zero, kqswnal_fwd_packet() copies even single-fragment packets
+ * into the tx descriptor's contiguous buffer (provided they fit) instead
+ * of mapping them for zero-copy.  Multi-fragment packets that fit are
+ * always copied regardless of this setting. */
+int kqswnal_fwd_copy_contig = 0;
+
+/*
+ * Forward the packet described by 'fwd' on behalf of the router.
+ * 'arg' is not used here (the scheduler passes NULL).
+ *
+ * If no transmit descriptor is available the function simply returns; per
+ * the comment at that check, the forward is rescheduled when a descriptor
+ * is freed.  On any other failure the descriptor is returned to the idle
+ * pool and the forward is completed immediately via kpr_fwd_done() with a
+ * non-zero error code.
+ */
+void
+kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        int             rc;
+        kqswnal_tx_t   *ktx;
+        struct iovec   *iov = fwd->kprfd_iov;
+        int             niov = fwd->kprfd_niov;
+        int             nob = fwd->kprfd_nob;
+        ptl_nid_t       nid = fwd->kprfd_gateway_nid;
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        /* The router wants this NAL to forward a packet */
+        CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n",
+                fwd, nid, niov, nob);
+
+        LASSERT (niov > 0);
+        
+        ktx = kqswnal_get_idle_tx (fwd, FALSE);
+        if (ktx == NULL)        /* can't get txd right now */
+                return;         /* fwd will be scheduled when tx desc freed */
+
+        if (nid == kqswnal_lib.ni.nid)          /* gateway is me */
+                nid = fwd->kprfd_target_nid;    /* target is final dest */
+
+        if (!kqswnal_ispeer (nid)) {
+                CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
+                rc = -EHOSTUNREACH;
+                goto failed;
+        }
+
+        if (nob > KQSW_NRXMSGBYTES_LARGE) {
+                CERROR ("Can't forward [%p] to "LPX64
+                        ": size %d bigger than max packet size %ld\n",
+                        fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE);
+                rc = -EMSGSIZE;
+                goto failed;
+        }
+
+        /* Copy when the packet is fragmented (or copying is forced) and it
+         * fits in the descriptor's buffer; otherwise map the fragments. */
+        if ((kqswnal_fwd_copy_contig || niov > 1) &&
+            nob <= KQSW_TX_BUFFER_SIZE) 
+        {
+                /* send from ktx's pre-allocated/mapped contiguous buffer? */
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */
+                ktx->ktx_iov[0].Len = nob;
+                ktx->ktx_niov = 1;
+        }
+        else
+        {
+                /* zero copy */
+                ktx->ktx_niov = 0;        /* no frags mapped yet */
+                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                if (rc != 0)
+                        goto failed;
+        }
+
+        /* 'nob' here includes the portals header, hence the adjusted
+         * threshold compared with the local send path */
+        ktx->ktx_port    = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_FORWARDING; /* kpr_fwd_done() on completion */
+        ktx->ktx_args[0] = fwd;
+
+        rc = kqswnal_launch (ktx);
+        if (rc == 0)
+                return;
+
+ failed:
+        /* every path reaching here has set rc to a non-zero error */
+        LASSERT (rc != 0);
+        CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
+
+        kqswnal_put_idle_tx (ktx);
+        /* complete now (with failure) */
+        kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc);
+}
+
+/*
+ * Completion callback registered with kpr_fwd_init() by kqswnal_rx():
+ * the router has finished forwarding the packet held in receive
+ * descriptor 'arg'.  Failures are logged; in all cases the receive
+ * descriptor is requeued.
+ */
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)arg;
+        ptl_hdr_t    *hdr;
+
+        if (error != 0) {
+                /* the portals header sits at the start of the first page */
+                hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error);
+        }
+
+        kqswnal_requeue_rx (krx);
+}
+
+void
+kqswnal_rx (kqswnal_rx_t *krx)
+{
+        ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]);
+        ptl_nid_t       dest_nid = NTOH__u64 (hdr->dest_nid);
+        int             nob;
+        int             niov;
+
+        if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */
+                /* NB krx requeued when lib_parse() calls back kqswnal_recv */
+                lib_parse (&kqswnal_lib, hdr, krx);
+                return;
+        }
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        if (kqswnal_ispeer (dest_nid))  /* should have gone direct to peer */
+        {
+                CERROR("dropping packet from "LPX64" for "LPX64
+                       ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        /* NB forwarding may destroy iov; rebuild every time */
+        for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++)
+        {
+                LASSERT (niov < krx->krx_npages);
+                krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]);
+                krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob);
+        }
+
+        kpr_fwd_init (&krx->krx_fwd, dest_nid,
+                      krx->krx_nob, niov, krx->krx_iov,
+                      kqswnal_fwd_callback, krx);
+
+        kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd);
+}
+
+/*
+ * Receive Interrupt Handler: posts to schedulers.
+ *
+ * Records the receive descriptor and byte count in the kqswnal_rx_t that
+ * was attached to 'rxd'.  Failed or too-short receives are requeued (or
+ * silently abandoned during shutdown); good ones are appended to
+ * kqn_readyrxds and the scheduler is woken.
+ */
+void
+kqswnal_rxhandler(EP_RXD *rxd)
+{
+        /* spin_lock_irqsave() requires an unsigned long to save IRQ state */
+        unsigned long flags;
+        int           nob    = ep_rxd_len (rxd);
+        int           status = ep_rxd_status (rxd);
+        kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
+
+        CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
+               rxd, krx, nob, status);
+
+        LASSERT (krx != NULL);
+
+        krx->krx_rxd = rxd;
+        krx->krx_nob = nob;
+
+        /* must receive a whole header to be able to parse.
+         * NB compare as signed: without the cast a negative 'nob' would be
+         * promoted to a huge unsigned value and pass the length check */
+        if (status != EP_SUCCESS || nob < (int)sizeof (ptl_hdr_t))
+        {
+                /* receives complete with failure when receiver is removed */
+                if (kqswnal_data.kqn_shuttingdown)
+                        return;
+
+                CERROR("receive status failed with status %d nob %d\n",
+                       status, nob);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        atomic_inc (&kqswnal_packets_received);
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+#if KQSW_CHECKSUM
+/*
+ * Log a checksum mismatch for the packet held in 'krx'.
+ * 'ishdr' is non-zero when the header checksum failed and zero when the
+ * payload checksum failed.  Only compiled when KQSW_CHECKSUM is set.
+ */
+void
+kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
+{
+        ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+        CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
+                ", dpid %d, spid %d, type %d\n",
+                ishdr ? "Header" : "Payload", krx,
+                NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid),
+                NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid),
+                NTOH__u32(hdr->type));
+
+        switch (NTOH__u32 (hdr->type))
+        {
+        case PTL_MSG_ACK:
+                /* wire handle fields named as in kqswnal_cerror_hdr() */
+                CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
+                       " len %u\n",
+                       NTOH__u32(hdr->msg.ack.mlength),
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.ack.match_bits),
+                       NTOH__u32(hdr->msg.ack.length));
+                break;
+        case PTL_MSG_PUT:
+                CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
+                       " len %u off %u data "LPX64"\n",
+                       NTOH__u32(hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.put.match_bits),
+                       NTOH__u32(hdr->msg.put.length),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+        case PTL_MSG_GET:
+                CERROR ("GET: <>\n");
+                break;
+        case PTL_MSG_REPLY:
+                CERROR ("REPLY: <>\n");
+                break;
+        default:
+                CERROR ("TYPE?: <>\n");
+        }
+}
+#endif
+
+/*
+ * Common receive path for kqswnal_recv() (destination described by 'iov')
+ * and kqswnal_recv_pages() (destination described by 'kiov'); at most one
+ * of the two may be non-NULL.
+ *
+ * Copies 'mlen' bytes of payload, which starts KQSW_HDR_SIZE bytes into
+ * the first receive page of the kqswnal_rx_t passed as 'private', into the
+ * destination fragments.  It then completes the message with
+ * lib_finalize() and requeues the receive descriptor.
+ *
+ * Returns 'rlen'.
+ */
+static int
+kqswnal_recvmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 ptl_kiov_t   *kiov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
+        int           page;             /* index of current source page */
+        char         *page_ptr;         /* read cursor in current page */
+        int           page_nob;         /* bytes left in current page */
+        char         *iov_ptr;          /* write cursor in current frag */
+        int           iov_nob;          /* bytes left in current frag */
+        int           frag;             /* bytes copied this iteration */
+#if KQSW_CHECKSUM
+        kqsw_csum_t   senders_csum;
+        kqsw_csum_t   payload_csum = 0;
+        kqsw_csum_t   hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]),
+                                           sizeof(ptl_hdr_t));
+        size_t        csum_len = mlen;
+        int           csum_frags = 0;
+        int           csum_nob = 0;
+        static atomic_t csum_counter;
+        /* print checksum details when the call counter is a multiple
+         * of 1000001 (including the very first call) */
+        int           csum_verbose = (atomic_read(&csum_counter)%1000001) == 0;
+
+        atomic_inc (&csum_counter);
+
+        /* sender's header checksum immediately follows the portals header */
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                                sizeof (ptl_hdr_t), sizeof (kqsw_csum_t));
+        if (senders_csum != hdr_csum)
+                kqswnal_csum_error (krx, 1);
+#endif
+        CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
+
+        /* What was actually received must be >= payload.
+         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
+        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        LASSERT (mlen <= rlen);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+        
+        if (mlen != 0)
+        {
+                /* source cursor: payload starts after the header area in
+                 * the first receive page */
+                page     = 0;
+                page_ptr = ((char *) page_address(krx->krx_pages[0])) +
+                        KQSW_HDR_SIZE;
+                page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
+
+                /* destination cursor: first fragment */
+                LASSERT (niov > 0);
+                if (kiov != NULL) {
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                        iov_nob = kiov->kiov_len;
+                } else {
+                        iov_ptr = iov->iov_base;
+                        iov_nob = iov->iov_len;
+                }
+
+                for (;;)
+                {
+                        /* We expect the iov to exactly match mlen */
+                        LASSERT (iov_nob <= mlen);
+                        
+                        /* copy as much as both the current page and the
+                         * current fragment allow */
+                        frag = MIN (page_nob, iov_nob);
+                        memcpy (iov_ptr, page_ptr, frag);
+#if KQSW_CHECKSUM
+                        payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
+                        csum_nob += frag;
+                        csum_frags++;
+#endif
+                        mlen -= frag;
+                        if (mlen == 0)
+                                break;
+
+                        /* advance the source cursor, moving to the next
+                         * receive page when this one is exhausted */
+                        page_nob -= frag;
+                        if (page_nob != 0)
+                                page_ptr += frag;
+                        else
+                        {
+                                page++;
+                                LASSERT (page < krx->krx_npages);
+                                page_ptr = page_address(krx->krx_pages[page]);
+                                page_nob = PAGE_SIZE;
+                        }
+
+                        /* advance the destination cursor, moving to the
+                         * next fragment when this one is full */
+                        iov_nob -= frag;
+                        if (iov_nob != 0)
+                                iov_ptr += frag;
+                        else if (kiov != NULL) {
+                                kunmap (kiov->kiov_page);
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                                iov_nob = kiov->kiov_len;
+                        } else {
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = iov->iov_base;
+                                iov_nob = iov->iov_len;
+                        }
+                }
+
+                /* release the mapping of the last page fragment written */
+                if (kiov != NULL)
+                        kunmap (kiov->kiov_page);
+        }
+
+#if KQSW_CHECKSUM
+        /* sender's payload checksum follows the header checksum */
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t));
+
+        /* csum_len is the original mlen: a truncated receive can't be
+         * compared with a checksum computed over the whole payload */
+        if (csum_len != rlen)
+                CERROR("Unable to checksum data in user's buffer\n");
+        else if (senders_csum != payload_csum)
+                kqswnal_csum_error (krx, 0);
+
+        if (csum_verbose)
+                CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, "
+                       "csum_nob %d\n",
+                        hdr_csum, payload_csum, csum_frags, csum_nob);
+#endif
+        lib_finalize(nal, private, cookie);
+
+        kqswnal_requeue_rx (krx);
+
+        return (rlen);
+}
+
+/*
+ * cb_recv entry point: receive into virtual-address fragments.
+ * Delegates to kqswnal_recvmsg() with no page fragment list and returns
+ * its result.
+ */
+static int
+kqswnal_recv(nal_cb_t     *nal,
+             void         *private,
+             lib_msg_t    *cookie,
+             unsigned int  niov,
+             struct iovec *iov,
+             size_t        mlen,
+             size_t        rlen)
+{
+        int rc;
+
+        rc = kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL,
+                              mlen, rlen);
+        return (rc);
+}
+
+/*
+ * cb_recv_pages entry point: receive into page fragments.
+ * Delegates to kqswnal_recvmsg() with no virtual-address fragment list
+ * and returns its result.
+ */
+static int
+kqswnal_recv_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    unsigned int  niov,
+                    ptl_kiov_t   *kiov,
+                    size_t        mlen,
+                    size_t        rlen)
+{
+        int rc;
+
+        rc = kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov,
+                              mlen, rlen);
+        return (rc);
+}
+
+/*
+ * Start a kernel thread running fn(arg).
+ * Returns 0 on success, after counting the thread in kqn_nthreads, or the
+ * negative value returned by kernel_thread() on failure.
+ */
+int
+kqswnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid;
+
+        pid = kernel_thread (fn, arg, 0);
+        if (pid >= 0) {
+                atomic_inc (&kqswnal_data.kqn_nthreads);
+                return (0);
+        }
+
+        return ((int)pid);
+}
+
+/*
+ * Called by a thread as it exits: undoes the kqn_nthreads increment made
+ * by kqswnal_thread_start().
+ */
+void
+kqswnal_thread_fini (void)
+{
+        atomic_dec(&kqswnal_data.kqn_nthreads);
+}
+
+/*
+ * Scheduler thread body ('arg' is unused).
+ *
+ * Until kqn_shuttingdown is set, each pass handles at most one entry from
+ * each of the three work queues:
+ *   kqn_readyrxds   - received packets, passed to kqswnal_rx()
+ *   kqn_delayedtxds - transmits queued by kqswnal_launch(), relaunched
+ *   kqn_delayedfwds - forwards passed to kqswnal_fwd_packet()
+ * kqn_sched_lock is held while the queues are examined and dropped around
+ * every call that does real work.  When a pass finds nothing to do the
+ * thread sleeps on kqn_sched_waitq; after KQSW_RESCHED busy passes it
+ * offers to yield the CPU.
+ *
+ * Always returns 0.
+ */
+int
+kqswnal_scheduler (void *arg)
+{
+        kqswnal_rx_t    *krx;
+        kqswnal_tx_t    *ktx;
+        kpr_fwd_desc_t  *fwd;
+        /* spin_lock_irqsave() requires an unsigned long to save IRQ state */
+        unsigned long    flags;
+        int              rc;
+        int              counter = 0;
+        int              did_something;
+
+        kportal_daemonize ("kqswnal_sched");
+        kportal_blockallsigs ();
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        while (!kqswnal_data.kqn_shuttingdown)
+        {
+                did_something = FALSE;
+
+                if (!list_empty (&kqswnal_data.kqn_readyrxds))
+                {
+                        krx = list_entry(kqswnal_data.kqn_readyrxds.next,
+                                         kqswnal_rx_t, krx_list);
+                        list_del (&krx->krx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        kqswnal_rx (krx);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
+                {
+                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
+                                         kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        rc = kqswnal_launch (ktx);
+                        if (rc != 0)          /* failed: ktx_nid down? */
+                        {
+                                CERROR("Failed delayed transmit to "LPX64
+                                       ": %d\n", ktx->ktx_nid, rc);
+                                /* complete (and recycle) the descriptor */
+                                kqswnal_tx_done (ktx, rc);
+                        }
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
+                {
+                        fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
+                        list_del (&fwd->kprfd_list);
+                        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+                        kqswnal_fwd_packet (NULL, fwd);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                    /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == KQSW_RESCHED) {
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        counter = 0;
+
+                        if (!did_something) {
+                                /* sleep until shutdown or new work arrives */
+                                rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
+                                                               kqswnal_data.kqn_shuttingdown ||
+                                                               !list_empty(&kqswnal_data.kqn_readyrxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedtxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedfwds));
+                                LASSERT (rc == 0);
+                        } else if (current->need_resched)
+                                schedule ();
+
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        kqswnal_thread_fini ();
+        return (0);
+}
+
+/* Callback table for this NAL: the object passed to lib_parse() and
+ * lib_finalize() throughout this file. */
+nal_cb_t kqswnal_lib =
+{
+        .nal_data      = &kqswnal_data,         /* NAL private data */
+        .cb_send       = kqswnal_send,
+        .cb_send_pages = kqswnal_send_pages,
+        .cb_recv       = kqswnal_recv,
+        .cb_recv_pages = kqswnal_recv_pages,
+        .cb_read       = kqswnal_read,
+        .cb_write      = kqswnal_write,
+        .cb_malloc     = kqswnal_malloc,
+        .cb_free       = kqswnal_free,
+        .cb_printf     = kqswnal_printf,
+        .cb_cli        = kqswnal_cli,
+        .cb_sti        = kqswnal_sti,
+        .cb_dist       = kqswnal_dist
+};
diff --git a/lustre/portals/knals/scimacnal/Makefile.am b/lustre/portals/knals/scimacnal/Makefile.am
new file mode 100644 (file)
index 0000000..6da31f0
--- /dev/null
@@ -0,0 +1,11 @@
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kscimacnal
+modulenet_DATA = kscimacnal.o
+EXTRA_PROGRAMS = kscimacnal
+
+DEFS =
+kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h
diff --git a/lustre/portals/knals/scimacnal/README.scimacnal b/lustre/portals/knals/scimacnal/README.scimacnal
new file mode 100644 (file)
index 0000000..d4c6a49
--- /dev/null
@@ -0,0 +1,14 @@
+
+scimacnal - A NAL for the Scali ScaMAC midlayer.
+
+The ScaMAC midlayer is a simplified API to the SCI high performance
+interconnect.
+
+In order to use this NAL you'll need to tune scimac to use larger buffers.
+See scimac.conf in this directory for an example.
+
+Overall performance and stability aren't great, but this can be attributed
+to the scimac driver, which apparently is in need of some development.
+
+TODO:
+Routing isn't yet implemented.
diff --git a/lustre/portals/knals/scimacnal/scimac.conf b/lustre/portals/knals/scimacnal/scimac.conf
new file mode 100644 (file)
index 0000000..bfb6d02
--- /dev/null
@@ -0,0 +1,35 @@
+#  Configuration file for the scimac driver - lustre friendly settings
+#
+
+#  The maximal number of message headers to use in the system.
+scimac_max_no_hdrs = 32
+
+#  The maximal number of eager buffers to use in the system.
+scimac_max_no_ebufs = 8
+
+#  The maximal size in bytes of each eager buffer.
+scimac_max_ebuf_size = 65536
+
+#  Enable use of a kernel thread to defer reception of packets.
+#  Default is to use a tasklet (sw interrupt).
+scimac_use_ulevel_recv = 1
+
+#  The maximal number of packets queued for transfer per path at any one time. 
+scimac_max_send_queuelen = 2000
+
+#  The packet retransmit time in milliseconds.
+#  The time that elapses between an attempt to send a packet and its retransmission.
+scimac_pkt_rexmit_time = 200
+
+#  The packet's maximal retransmit time in milliseconds.
+#  The total time during which sending a packet is attempted before it is dropped.
+scimac_max_rexmit_time = 5000
+
+#  The lowest valid node identifier in the system.
+scimac_min_nodeid_number = 0x100
+
+#  The largest valid node identifier in the system.
+scimac_max_nodeid_number = 0xff00
+
+#  The incremental nodeid step in the system.
+scimac_nodeid_increment = 0x100
diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c
new file mode 100644 (file)
index 0000000..1066d69
--- /dev/null
@@ -0,0 +1,219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ * Based on gmnal, which is based on ksocknal and qswnal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "scimacnal.h"
+
+/* Network interface handle; filled in by PtlNIInit() in
+ * kscimacnal_initialize() and exported with EXPORT_SYMBOL at the end of
+ * this file. */
+ptl_handle_ni_t kscimacnal_ni;
+/* API-side NAL descriptor; its method pointers are assigned in
+ * kscimacnal_initialize(). */
+nal_t  kscimacnal_api;
+
+/* The single instance of this NAL's private state. */
+kscimacnal_data_t kscimacnal_data;
+
+/* Router interface record for this NAL: tags it as SCIMACNAL, passes no
+ * private argument and names kscimacnal_fwd_packet() as the forwarding
+ * entry point. */
+kpr_nal_interface_t kscimacnal_router_interface = {
+        .kprni_nalid = SCIMACNAL,
+        .kprni_arg   = NULL,
+        .kprni_fwd   = kscimacnal_fwd_packet,
+};
+
+
+/* API "forward" method: pass an API request on to the portals library.
+ *
+ * nal       - API NAL descriptor; asserted to be kscimacnal_api
+ * id        - request identifier, handed unchanged to lib_dispatch()
+ * args, ret - request argument and result blocks, handed through unchanged
+ * args_len, ret_len - not used by this function
+ *
+ * Always returns PTL_OK. */
+static int
+kscimacnal_forward(nal_t *nal, int id, void *args, size_t args_len,
+                   void *ret, size_t ret_len)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* Only one instance of this NAL exists; anything else is a bug */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        /* The library side gets ksci as its private pointer */
+        lib_dispatch(nal_cb, ksci, id, args, ret);
+
+        return PTL_OK;
+}
+
+
+/* API "lock" method: delegates to the library's cb_cli callback, passing
+ * the caller's flags pointer through. */
+static void
+kscimacnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* Sanity-check that we were handed the one and only instance */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_cli(nal_cb, flags);
+}
+
+
+/* API "unlock" method: delegates to the library's cb_sti callback, passing
+ * the caller's flags pointer through. */
+static void
+kscimacnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci;
+        nal_cb_t          *nal_cb;
+
+        ksci   = nal->nal_data;
+        nal_cb = ksci->ksci_cb;
+
+        /* Sanity-check that we were handed the one and only instance */
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_sti(nal_cb, flags);
+}
+
+
+/* API "shutdown" method: only validates the NAL pointer and reports
+ * success.  The ni argument is not used. */
+static int
+kscimacnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        return 0;
+}
+
+
+/* API "yield" method: call schedule() if the current task has been flagged
+ * as needing a reschedule, otherwise return immediately. */
+static void
+kscimacnal_yield(nal_t *nal)
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        if (!current->need_resched)
+                return;
+
+        schedule();
+}
+
+
+/* NAL init hook passed to PtlNIInit(): initializes the portals library for
+ * this NAL with the node id stored in kscimacnal_data and returns the API
+ * descriptor.
+ *
+ * ptl_size, ac_size        - handed through to lib_init()
+ * interface, requested_pid - not used by this function
+ *
+ * The return value of lib_init() is not checked. */
+static nal_t *
+kscimacnal_init(int interface, ptl_pt_index_t ptl_size,
+                ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        /* FIXME: Need a ScaMAC function to get the number of nodes */
+        int     nnids = 512;
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n",
+               kscimacnal_data.ksci_nid, nnids);
+
+        lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,
+                 ptl_size, ac_size);
+
+        return &kscimacnal_api;
+}
+
+
+/* Called by kernel at module unload time.
+ *
+ * Flags the NAL as shutting down, unregisters the exported interface
+ * handle, shuts down the portals interface and library, and finally closes
+ * the ScaMAC handle. */
+static void __exit
+kscimacnal_finalize(void)
+{
+        /* FIXME: How should the shutdown procedure really look? */
+
+        /* From here on kscimacnal_rx() discards whatever still arrives */
+        kscimacnal_data.ksci_shuttingdown = 1;
+
+        PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni);
+
+        PtlNIFini(kscimacnal_ni);
+        lib_fini(&kscimacnal_lib);
+
+        mac_finish(kscimacnal_data.ksci_machandle);
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+}
+
+
+/* Called by kernel at module insertion time.
+ *
+ * Fills in the API method table, opens the (single) ScaMAC adapter, checks
+ * that its MTU is large enough, reads the local node id and brings up the
+ * portals network interface.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENODEV  mac_init() failed
+ *   -EINVAL  the scimac MTU is smaller than SCIMACNAL_MTU
+ *   -EIO     the node id could not be read
+ *   -ENOMEM  PtlNIInit() failed
+ */
+static int __init
+kscimacnal_initialize(void)
+{
+        int rc;
+        unsigned long     nid=0;
+        mac_handle_t    *machandle = NULL;
+
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kscimacnal_api.forward = kscimacnal_forward;
+        kscimacnal_api.shutdown = kscimacnal_shutdown;
+        kscimacnal_api.yield = kscimacnal_yield;
+        kscimacnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kscimacnal_api.lock= kscimacnal_lock;
+        kscimacnal_api.unlock= kscimacnal_unlock;
+        kscimacnal_api.nal_data = &kscimacnal_data;
+
+        kscimacnal_lib.nal_data = &kscimacnal_data;
+
+        memset(&kscimacnal_data, 0, sizeof(kscimacnal_data));
+
+        kscimacnal_data.ksci_cb = &kscimacnal_lib;
+
+        /* We're not using this, but cli/sti callbacks does... ??? */
+        spin_lock_init(&kscimacnal_data.ksci_dispatch_lock);
+
+        /* FIXME: We only support one adapter for now */
+        machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx,
+                        &kscimacnal_data);
+
+        if(!machandle) {
+                CERROR("mac_init() failed\n");
+                /* Nothing to clean up yet */
+                return -ENODEV;
+        }
+
+        kscimacnal_data.ksci_machandle = machandle;
+
+        /* Make sure the scimac MTU is tuned */
+        if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) {
+                CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n",
+                                mac_get_mtusize(machandle), SCIMACNAL_MTU);
+                CERROR("Consult README.scimacnal for more information\n");
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        /* Get the node ID */
+        /* mac_get_physaddrlen() is a function instead of define, sigh */
+        LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid));
+        if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) {
+                CERROR("mac_get_physaddr() failed\n");
+                rc = -EIO;
+                goto failed;
+        }
+        /* NOTE(review): the address is written into an unsigned long through
+         * a cast pointer and then run through ntohl(); presumably it is a
+         * 32-bit value in network byte order - confirm this also holds on
+         * 64-bit and big-endian hosts. */
+        nid = ntohl(nid);
+        kscimacnal_data.ksci_nid = nid;
+
+
+        /* Initialize Network Interface */
+        /* FIXME: What do the magic numbers mean? Documentation anyone? */
+        rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
+
+        /* We're done now, it's OK for the RX callback to do stuff */
+        kscimacnal_data.ksci_init = 1;
+
+        return 0;
+
+failed:
+        /* Single exit for every failure after the adapter was opened */
+        mac_finish(machandle);
+        return rc;
+}
+
+
+/* Module metadata */
+MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_LICENSE("GPL");
+
+/* Module load and unload entry points */
+module_init (kscimacnal_initialize);
+module_exit (kscimacnal_finalize);
+
+/* Make the network interface handle visible to other modules */
+EXPORT_SYMBOL(kscimacnal_ni);
diff --git a/lustre/portals/knals/scimacnal/scimacnal.h b/lustre/portals/knals/scimacnal/scimacnal.h
new file mode 100644 (file)
index 0000000..1ff180e
--- /dev/null
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+ */
+
+
+#ifndef _SCIMACNAL_H
+#define _SCIMACNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <asm/page.h>            /* For PAGE_SIZE */
+
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <scamac.h>
+
+/* SAP id handed to mac_init(); falls back to MAC_SAPID_TEST1 when nothing
+ * included above defines MAC_SAPID_LUSTRE. */
+#ifndef MAC_SAPID_LUSTRE
+#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
+#endif /* MAC_SAPID_LUSTRE */
+
+/* Smallest scimac MTU (in bytes) this NAL accepts; kscimacnal_initialize()
+ * refuses to load when mac_get_mtusize() reports less. */
+#define SCIMACNAL_MTU 65536
+/* FIXME: What is really the MTU of lustre? */
+/* Build-time guard: a maximally sized iov must fit within SCIMACNAL_MTU */
+#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#endif
+
+/* Receive context: built on the stack in kscimacnal_rx() from the arguments
+ * of the ScaMAC receive callback and handed to lib_parse() as the private
+ * pointer; kscimacnal_recv() reads the message from it. */
+typedef struct {
+        mac_handle_t    *handle;        /* handle the message arrived on */
+        mac_mblk_t      *msg;           /* the received message */
+        mac_msg_type_t   type;          /* message type reported by ScaMAC */
+        void            *userdata;      /* callback userdata (the kscimacnal_data_t) */
+}  kscimacnal_rx_t;
+
+
+/* Transmit descriptor: allocated in kscimacnal_send() and released in
+ * kscimacnal_txrelease(), which uses the first three fields as the
+ * arguments to lib_finalize(). */
+typedef struct {
+        nal_cb_t        *ktx_nal;       /* library NAL the send came from */
+        void            *ktx_private;   /* private pointer given to cb_send */
+        lib_msg_t       *ktx_cookie;    /* portals message to finalize */
+        ptl_hdr_t       ktx_hdr;        /* copy of the header; the first mblk points here */
+}  kscimacnal_tx_t;
+
+
+/* Private state of the NAL; a single instance (kscimacnal_data) exists. */
+typedef struct {
+        char              ksci_init;            /* set to 1 once module init has completed; RX drops messages while 0 */
+        char              ksci_shuttingdown;    /* set to 1 at module unload; RX drops messages once set */
+        ptl_nid_t         ksci_nid;             /* local node id read from ScaMAC at init */
+        nal_cb_t         *ksci_cb;              /* library-side callback table */
+        spinlock_t        ksci_dispatch_lock;   /* taken by cb_cli, released by cb_sti */
+        mac_handle_t     *ksci_machandle;       /* handle returned by mac_init() */
+}  kscimacnal_data_t;
+
+/* Globals defined in scimacnal.c (data, api) and scimacnal_cb.c (lib) */
+extern kscimacnal_data_t   kscimacnal_data;
+extern nal_t            kscimacnal_api;
+extern nal_cb_t         kscimacnal_lib;
+
+/* Router forwarding entry point (currently only logs an error) */
+void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+/* Receive callback registered with mac_init() */
+void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata);
+
+
+#endif  /* _SCIMACNAL_H */
diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c
new file mode 100644 (file)
index 0000000..7e4a2e8
--- /dev/null
@@ -0,0 +1,468 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "scimacnal.h"
+
+/* Library cb_read callback: copies len bytes from src_addr to dst_addr with
+ * a plain memcpy().  The private argument is not used.  Always returns 0. */
+static int
+kscimacnal_read(nal_cb_t *nal, void *private, void *dst_addr,
+                user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+
+/* Library cb_write callback: copies len bytes from src_addr to dst_addr
+ * with a plain memcpy().  The private argument is not used.  Always
+ * returns 0. */
+static int
+kscimacnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+                 void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return 0;
+}
+
+
+/* Library cb_malloc callback: allocates len bytes through PORTAL_ALLOC and
+ * returns whatever pointer that macro stored.  The nal argument is unused. */
+static void *
+kscimacnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+
+        return buf;
+}
+
+
+/* Library cb_free callback: releases a buffer of len bytes through
+ * PORTAL_FREE.  The nal argument is unused. */
+static void
+kscimacnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+
+/* Library cb_printf callback: when D_NET debugging is enabled, formats the
+ * message into a 256-byte stack buffer (longer output is truncated by
+ * vsnprintf) and prints it prefixed with the current CPU id. */
+static void
+kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    msg[256];
+        va_list ap;
+
+        /* Nothing to do unless network debugging is switched on */
+        if (!(portal_debug & D_NET))
+                return;
+
+        va_start(ap, fmt);
+        vsnprintf(msg, sizeof(msg), fmt, ap);
+        va_end(ap);
+
+        printk("CPUId: %d %s", smp_processor_id(), msg);
+}
+
+
+/* Library cb_cli callback: takes the dispatch spinlock with
+ * spin_lock_irqsave(), storing the saved state in *flags. */
+static void
+kscimacnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+
+        spin_lock_irqsave(&ksci->ksci_dispatch_lock, *flags);
+}
+
+
+/* Library cb_sti callback: drops the dispatch spinlock with
+ * spin_unlock_irqrestore(), using the state saved in *flags. */
+static void
+kscimacnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+
+        spin_unlock_irqrestore(&ksci->ksci_dispatch_lock, *flags);
+}
+
+
+/* Library cb_dist callback: reports a distance of 0 to ourselves and 1 to
+ * every other nid.  Always returns 0. */
+static int
+kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* FIXME: Network distance has a meaning, but is there no easy
+         * way to figure it out (depends on routing) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return 0;
+}
+
+
+/* Map a ScaMAC message status to the name of its constant; any value not
+ * listed yields "Unknown error".  The returned strings are literals and
+ * must not be modified or freed.
+ * NOTE(review): the parameter type is mac_status_t while the caller passes
+ * a mac_msg_status_t - confirm both names exist in scamac.h. */
+static char *
+get_mac_error(mac_status_t status)
+{
+        switch (status) {
+        case MAC_MSG_STAT_OK:          return "MAC_MSG_STAT_OK";
+        case MAC_MSG_STAT_FREED:       return "MAC_MSG_STAT_FREED";
+        case MAC_MSG_STAT_ABORTED:     return "MAC_MSG_STAT_ABORTED";
+        case MAC_MSG_STAT_TIMEDOUT:    return "MAC_MSG_STAT_TIMEDOUT";
+        case MAC_MSG_STAT_NODEUNREACH: return "MAC_MSG_STAT_NODEUNREACH";
+        case MAC_MSG_STAT_NETDOWN:     return "MAC_MSG_STAT_NETDOWN";
+        case MAC_MSG_STAT_RESET:       return "MAC_MSG_STAT_RESET";
+        case MAC_MSG_STAT_INITFAILED:  return "MAC_MSG_STAT_INITFAILED";
+        case MAC_MSG_STAT_SYNCFAILED:  return "MAC_MSG_STAT_SYNCFAILED";
+        case MAC_MSG_STAT_BADPROTO:    return "MAC_MSG_STAT_BADPROTO";
+        case MAC_MSG_STAT_NOBUFSPACE:  return "MAC_MSG_STAT_NOBUFSPACE";
+        case MAC_MSG_STAT_CONGESTION:  return "MAC_MSG_STAT_CONGESTION";
+        case MAC_MSG_STAT_OTHER:       return "MAC_MSG_STAT_OTHER";
+        default:                       return "Unknown error";
+        }
+}
+
+
+/* FIXME add routing code here ? */
+
+/* Called by ScaMac when transmission is complete (ie. message is released).
+ *
+ * msg     - the released message; not touched here
+ * status  - ScaMAC completion status; anything but MAC_MSG_STAT_OK is logged
+ * context - the kscimacnal_tx_t registered with mac_alloc_mblk() by
+ *           kscimacnal_send(); must not be NULL
+ *
+ * Finalizes the portals message and frees the transmit descriptor. */
+static void 
+kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
+{
+        kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
+
+        LASSERT (ktx != NULL);
+
+        /* Euh, there is no feedback when transmission fails?!
+         * lib_finalize() as called below takes no completion status, so a
+         * failed transmission can only be logged. */
+        if (status != MAC_MSG_STAT_OK)
+                CERROR("kscimacnal: transmit failed: %s (%d)\n",
+                       get_mac_error(status), status);
+
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+}
+
+
+/* Called by portals when it wants to send a message.
+ * Since ScaMAC has its own TX thread we don't bother setting up our own.
+ *
+ * nal          - library NAL; its nal_data is the kscimacnal_data_t
+ * private      - opaque pointer handed back to lib_finalize() on completion
+ * cookie       - portals message handed back to lib_finalize() on completion
+ * hdr          - portals header; copied because it is only valid during
+ *                this call
+ * nid          - destination node id
+ * payload_niov, payload_iov, payload_len - payload fragments and total
+ *                payload length in bytes
+ * type, pid    - not used by this function
+ *
+ * Returns 0 once the message has been queued with mac_send(), -EINVAL if
+ * header plus payload exceed the ScaMAC MTU, -ENOMEM on allocation failure,
+ * or the non-zero return code of mac_send(). */
+static int 
+kscimacnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type, 
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           unsigned int     payload_niov,
+           struct iovec    *payload_iov,
+           size_t           payload_len)
+{
+        kscimacnal_tx_t    *ktx=NULL;
+        kscimacnal_data_t  *ksci = nal->nal_data;
+        int              rc=0;
+        /* Kept as size_t so an oversized payload cannot wrap negative and
+         * slip past the MTU check below */
+        size_t           buf_len = sizeof(ptl_hdr_t) + payload_len;
+        mac_mblk_t      *msg=NULL, *lastblk, *newblk;
+        unsigned long   physaddr;
+        
+
+        /* Cast so the argument matches the %d conversion */
+        CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n",
+               (int)payload_len, payload_iov, nid, payload_niov);
+
+        LASSERT(ksci != NULL);
+
+        LASSERT(hdr != NULL);
+
+        /* Do real check if we can send this */
+        if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
+                CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
+                                mac_get_mtusize(ksci->ksci_machandle));
+                return -EINVAL;
+        }
+
+
+        /* save transaction info for later finalize and cleanup */
+        PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
+        if (!ktx) {
+                return -ENOMEM;
+        }
+
+        /* Fill in the completion context before ScaMAC is given a pointer
+         * to it, so the release callback never sees unset fields */
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+
+        /* *SIGH* hdr is a stack variable in the calling function, so we
+         * need to copy it to a buffer. Zerocopy magic (or is it just
+         * deferred memcpy?) is annoying sometimes.  */
+        memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t));
+
+        /* First, put the header in the main message mblk */
+        msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t),
+                        kscimacnal_txrelease, ktx);
+        if (!msg) {
+                rc = -ENOMEM;
+                goto out_free_ktx;
+        }
+        mac_put_mblk(msg, sizeof(ptl_hdr_t));
+        lastblk=msg;
+
+        /* Allocate additional mblks for each iov as needed.
+         * Essentially lib_copy_iov2buf with a twist or two */
+        while (payload_len > 0)
+        {
+                ptl_size_t nob;
+
+                LASSERT (payload_niov > 0);
+
+                nob = MIN (payload_iov->iov_len, payload_len);
+
+                /* We don't need a callback on the additional mblks, since
+                 * all release callbacks seems to be called when the entire
+                 * message has been sent */
+                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
+                if(!newblk) {
+                        rc = -ENOMEM;
+                        goto out_free_msg;
+                }
+                mac_put_mblk(newblk, nob);
+                mac_link_mblk(lastblk, newblk);
+                lastblk=newblk;
+
+                payload_len -= nob;
+                payload_niov--;
+                payload_iov++;
+        }
+
+        CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", (int)buf_len, nid);
+
+        /* NOTE(review): nid is narrowed through htonl() into an unsigned
+         * long whose address is passed as the physical address; presumably
+         * the address is 32 bits - confirm for 64-bit/big-endian hosts. */
+        physaddr = htonl(nid);
+
+        rc = mac_send(ksci->ksci_machandle, msg,
+                      (mac_physaddr_t *) &physaddr);
+        if (rc) {
+                CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
+                goto out_free_msg;
+        }
+
+        return 0;
+
+out_free_msg:
+        /* NOTE(review): if mac_free_msg() invokes the release callback
+         * registered on the first mblk, kscimacnal_txrelease() would already
+         * finalize the message and free ktx, making the PORTAL_FREE below a
+         * double free - confirm against the ScaMAC documentation. */
+        mac_free_msg(msg);
+out_free_ktx:
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+        return rc;
+}
+
+
+/* Router forwarding entry point (see kscimacnal_router_interface).
+ * Forwarding is not implemented: this only logs an error and touches
+ * neither arg nor fwd.
+ * NOTE(review): the forwarding descriptor is not completed or handed back
+ * here - confirm whether the router expects that from this callback. */
+void
+kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Process a received portals packet */
+/* Called by the ScaMac RX thread when a packet is received */
+void
+kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
+                void *userdata)
+{
+        ptl_hdr_t       *hdr = NULL;
+        kscimacnal_rx_t     krx; 
+        mac_size_t       size;
+        kscimacnal_data_t  *ksci = userdata;
+
+        LASSERT(ksci != NULL);
+
+        if ( !ksci->ksci_init || ksci->ksci_shuttingdown || 
+                    type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) {
+                /* We're not interested in messages not for us, ignore */
+                mac_free_msg(msg);
+                return;
+        }
+
+        size = mac_msg_size(msg);
+
+        CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", 
+                        msg, type, size, mac_msg_mblks(msg));
+
+        if( size < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (ksci->ksci_shuttingdown)
+                        return;
+                CERROR("kscimacnal: did not receive complete portal header,"
+                                "size= %ld\n", size);
+                /* Free the message before exiting */
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* Provide everything we know */
+        krx.handle = handle;
+        krx.msg = msg;
+        krx.type = type;
+        krx.userdata = userdata;
+
+        /* mac_msg_next returns the next mblk with unread data */
+        hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) );
+
+        if(!hdr) {
+                CERROR("kscimacnal: no data block in message %p\n", msg);
+                mac_free_msg(msg);
+                return;
+        }
+
+        if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc and calls our callback */
+                lib_parse(&kscimacnal_lib, hdr, &krx);
+                PROF_FINISH(lib_parse);
+#if 0 /* FIXME: Is it possible to detect this? */
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx:"
+                                "target is a  peer\n",
+                                hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+#endif /* if 0 FIXME */
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n",
+                                kscimacnal_lib.ni.nid, hdr->dest_nid);
+        }
+
+        mac_free_msg(msg);
+
+        CDEBUG(D_NET, "msg %p: Done\n", msg);
+}
+
+
+/* Called by portals to process a received packet.
+ *
+ * nal       - library NAL, handed through to lib_finalize()
+ * private   - the kscimacnal_rx_t set up by kscimacnal_rx()
+ * cookie    - portals message to finalize once the data is copied
+ * niov, iov - destination fragments
+ * mlen      - number of payload bytes to deliver into iov
+ * rlen      - number of payload bytes the sender claims to have sent
+ *
+ * Copies at most mlen bytes of unread message data into iov, then calls
+ * lib_finalize().  Returns rlen. */
+static int kscimacnal_recv(nal_cb_t     *nal, 
+                      void         *private, 
+                      lib_msg_t    *cookie, 
+                      unsigned int  niov, 
+                      struct iovec *iov, 
+                      size_t        mlen, 
+                      size_t        rlen)
+{
+        kscimacnal_rx_t    *krx = private;
+        mac_mblk_t      *mblk;
+        void            *src;
+        mac_size_t       pkt_len;
+        ptl_size_t       iovused=0;
+        size_t           nleft = mlen;  /* payload bytes still to deliver */
+
+        LASSERT (krx != NULL);
+        LASSERT (krx->msg != NULL);
+
+        /* Casts make the arguments match the %d conversions */
+        CDEBUG(D_NET,"msg %p: mlen=%d, rlen=%d, niov=%d\n",
+                        krx->msg, (int)mlen, (int)rlen, niov);
+
+        /* What was actually received must be >= what sender claims to have
+         * sent.  This is an LASSERT, since lib-move doesn't check cb return
+         * code yet. Also, rlen seems to be bogus when mlen==0 so don't
+         * assert on that.
+         */
+        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
+        LASSERT (mlen==0 || mlen <= rlen);
+
+        PROF_START(memcpy);
+
+        /* mac_msg_next returns next mblk with unread data (ie. can
+         * be same mblk */
+        while (nleft != 0 && (mblk = mac_msg_next(krx->msg))) {
+                pkt_len = mac_mblk_len(mblk);
+                src = mac_get_mblk(mblk, pkt_len); /* Next unread block */
+
+                CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld  src: %p\n",
+                                krx->msg, mblk, pkt_len, src);
+
+                LASSERT(src != NULL);
+
+                /* Never deliver more than mlen bytes in total: when the
+                 * message carries more data than is wanted (truncation or
+                 * trailing bytes) the excess is simply not copied */
+                if (pkt_len > nleft)
+                        pkt_len = nleft;
+
+                /* Essentially lib_copy_buf2iov but with continuation support,
+                 * we "gracefully" thrash the argument vars ;) */
+                while (pkt_len > 0) {
+                        ptl_size_t nob;
+
+                        LASSERT (niov > 0);
+
+                        LASSERT(iovused < iov->iov_len);
+
+                        nob = MIN (iov->iov_len-iovused, pkt_len);
+                        CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p  nob: %d "
+                                        "iovused: %d\n",
+                                        iov->iov_base, (int)iov->iov_len,
+                                        src, (int)nob, (int)iovused);
+
+                        memcpy (iov->iov_base+iovused, src, nob);
+                        pkt_len -= nob;
+                        nleft -= nob;
+                        src += nob;
+
+                        if(nob+iovused < iov->iov_len) {
+                                /* We didn't use all of the iov */
+                                iovused+=nob;
+                        }
+                        else {
+                                /* This iov is full, move on to the next */
+                                niov--;
+                                iov++;
+                                iovused=0;
+                        }
+                }
+        }
+        PROF_FINISH(memcpy);
+
+        CDEBUG(D_NET, "Calling lib_finalize.\n");
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        CDEBUG(D_NET, "Done.\n");
+
+        return rlen;
+}
+
+
+/* Library-side callback table for this NAL.  The page-based send/receive
+ * callbacks are left unset. */
+nal_cb_t kscimacnal_lib = {
+        .nal_data      = &kscimacnal_data,      /* NAL private data */
+        .cb_send       = kscimacnal_send,
+        .cb_send_pages = NULL,                  /* Ignore for now */
+        .cb_recv       = kscimacnal_recv,
+        .cb_recv_pages = NULL,
+        .cb_read       = kscimacnal_read,
+        .cb_write      = kscimacnal_write,
+        .cb_malloc     = kscimacnal_malloc,
+        .cb_free       = kscimacnal_free,
+        .cb_printf     = kscimacnal_printf,
+        .cb_cli        = kscimacnal_cli,
+        .cb_sti        = kscimacnal_sti,
+        .cb_dist       = kscimacnal_dist
+};
diff --git a/lustre/portals/knals/socknal/Makefile.am b/lustre/portals/knals/socknal/Makefile.am
new file mode 100644 (file)
index 0000000..437d7fc
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ksocknal
+modulenet_DATA = ksocknal.o
+EXTRA_PROGRAMS = ksocknal
+
+DEFS =
+ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h
diff --git a/lustre/portals/knals/socknal/Makefile.mk b/lustre/portals/knals/socknal/Makefile.mk
new file mode 100644 (file)
index 0000000..46edf01
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Kernelenv
+
+obj-y += ksocknal.o
+ksocknal-objs    := socknal.o socknal_cb.o
+
diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c
new file mode 100644 (file)
index 0000000..d15d8c8
--- /dev/null
@@ -0,0 +1,863 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+/* Network interface handle: filled in by PtlNIInit() in
+ * ksocknal_module_init() and exported at the bottom of this file */
+ptl_handle_ni_t         ksocknal_ni;
+/* API-side NAL descriptor; its method pointers are assigned in
+ * ksocknal_module_init() */
+static nal_t            ksocknal_api;
+/* Global state of this NAL.  It has external linkage on 2.5+ kernels and
+ * is file-local otherwise. */
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ksock_nal_data_t ksocknal_data;
+#else
+static ksock_nal_data_t ksocknal_data;
+#endif
+
+/* Registration record handed to kpr_register() in ksocknal_module_init();
+ * it names ksocknal_fwd_packet as the forwarding entry point and passes
+ * the NAL's global state as its argument */
+kpr_nal_interface_t ksocknal_router_interface = {
+        kprni_nalid:      SOCKNAL,
+        kprni_arg:        &ksocknal_data,
+        kprni_fwd:        ksocknal_fwd_packet,
+};
+
+
+/* API 'forward' method: hand request 'id' with its argument and return
+ * blocks to the portals library dispatcher.  'args_len' and 'ret_len' are
+ * not used.  Always returns PTL_OK. */
+int
+ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *nal_data = nal->nal_data;
+
+        /* the NAL data goes along as 'private': ksocknal_send needs it */
+        lib_dispatch(nal_data->ksnd_nal_cb, nal_data, id, args, ret);
+
+        return PTL_OK;
+}
+
+/* API 'shutdown' method: close every connection (nid 0 means "all") and
+ * return ksocknal_close_sock()'s result.  'nal' and 'ni' are not used. */
+int
+ksocknal_api_shutdown(nal_t *nal, int ni)
+{
+        int rc;
+
+        CDEBUG (D_NET, "closing all connections\n");
+
+        rc = ksocknal_close_sock(0);
+        return rc;
+}
+
+/* API 'yield' method: give other tasks a chance to run. */
+void
+ksocknal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+}
+
+/* API 'lock' method: delegate to the library-side cb_cli callback, which
+ * receives 'flags' to save state into. */
+void
+ksocknal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *nal_data = nal->nal_data;
+        nal_cb_t         *cb = nal_data->ksnd_nal_cb;
+
+        cb->cb_cli(cb, flags);
+}
+
+/* API 'unlock' method: delegate to the library-side cb_sti callback,
+ * passing back the 'flags' used for the matching lock call. */
+void
+ksocknal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *nal_data = nal->nal_data;
+        nal_cb_t         *cb = nal_data->ksnd_nal_cb;
+
+        cb->cb_sti(cb, flags);
+}
+
+/* NAL startup hook passed to PtlNIInit(): initialise the library side
+ * with the currently configured nid and return the API-side descriptor.
+ * 'interface' and 'requested_pid' are not used; lib_init()'s result is
+ * not checked. */
+nal_t *
+ksocknal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        ptl_nid_t mynid = ksocknal_data.ksnd_mynid;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", mynid);
+
+        lib_init(&ksocknal_lib, mynid, 0, 10, ptl_size, ac_size);
+
+        return (&ksocknal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+
+/* On pre-2.5 kernels define SOCKET_I() here as the address of the inode's
+ * u.socket_i member (presumably because those kernels do not provide the
+ * macro themselves). */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+/* Map an inode to its struct socket via SOCKET_I().  Nothing here checks
+ * that the inode actually belongs to a socket. */
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        return SOCKET_I(inode);
+}
+
+/* Record 'nid' as this node's id, both in the NAL's own state and in the
+ * library's network-interface structure.  Always returns 0. */
+int
+ksocknal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *lib_ni = &ksocknal_lib.ni;
+
+        /* FIXME: lib_init() runs at module insertion time, before 'mynid'
+         * is known, and it is lib_init() that sets the nid the NAL uses to
+         * tell other nodes where packets come from.  Patching the nid in
+         * afterwards like this is not a very graceful solution. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, lib_ni->nid);
+
+        lib_ni->nid = nid;
+        ksocknal_data.ksnd_mynid = nid;
+
+        return (0);
+}
+
+/*
+ * Bind interrupt 'irq' to CPU 'cpu' by running a user-mode helper shell
+ * that writes the affinity mask to /proc/irq/<irq>/smp_affinity.
+ *
+ * Compiles to a no-op unless CONFIG_SMP is defined and CPU_AFFINITY is
+ * non-zero.  The helper's result is not checked.
+ */
+void
+ksocknal_bind_irq (unsigned int irq, int cpu)
+{
+#if (defined(CONFIG_SMP) && CPU_AFFINITY)
+        char  cmdline[64];
+        char *argv[] = {"/bin/sh",
+                        "-c",
+                        cmdline,
+                        NULL};
+        char *envp[] = {"HOME=/",
+                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                        NULL};
+
+        /* smp_affinity takes a hexadecimal CPU bitmask, so the mask must be
+         * printed in hex; in decimal the digits are only right for CPUs
+         * 0-3 (e.g. CPU 4 would be written as "16", i.e. mask 0x16) */
+        snprintf (cmdline, sizeof (cmdline),
+                  "echo %x > /proc/irq/%u/smp_affinity", 1U << cpu, irq);
+
+        printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+                irq, cpu, cmdline);
+
+        /* FIXME: Find a better method of setting IRQ affinity...
+         */
+
+        call_usermodehelper (argv[0], argv, envp);
+#endif
+}
+
+int
+ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        ksock_sched_t     *sched = NULL;
+        unsigned int       irq = 0;
+        struct net_device *dev = NULL;
+        int                ret;
+        int                idx;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        memset (conn, 0, sizeof (conn));        /* zero for consistency */
+
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_saved_data_ready = sock->sk->data_ready;
+        conn->ksnc_saved_write_space = sock->sk->write_space;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ksocknal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+#warning check it is OK to derefence sk->dst_cache->dev like this...
+        lock_sock (conn->ksnc_sock->sk);
+
+        if (conn->ksnc_sock->sk->dst_cache != NULL) {
+                dev = conn->ksnc_sock->sk->dst_cache->dev;
+                if (dev != NULL) {
+                        irq = dev->irq;
+                        if (irq >= NR_IRQS) {
+                                CERROR ("Unexpected IRQ %x\n", irq);
+                                irq = 0;
+                        }
+                }
+        }
+
+        release_sock (conn->ksnc_sock->sk);
+
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (irq == 0 ||
+            ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) {
+                /* This is a software NIC, or we haven't associated it with
+                 * a CPU yet */
+
+                /* Choose the CPU with the fewest connections */
+                sched = ksocknal_data.ksnd_schedulers;
+                for (idx = 1; idx < SOCKNAL_N_SCHED; idx++)
+                        if (sched->kss_nconns >
+                            ksocknal_data.ksnd_schedulers[idx].kss_nconns)
+                                sched = &ksocknal_data.ksnd_schedulers[idx];
+
+                if (irq != 0) {                 /* Hardware NIC */
+                        /* Remember which scheduler we chose */
+                        idx = sched - ksocknal_data.ksnd_schedulers;
+
+                        LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK);
+
+                        if (bind_irq)       /* remember if we will bind below */
+                                idx |= SOCKNAL_IRQ_BOUND;
+
+                        ksocknal_data.ksnd_irq_info[irq] = idx;
+                }
+        } else { 
+                /* This is a hardware NIC, associated with a CPU */
+                idx = ksocknal_data.ksnd_irq_info[irq];
+
+                /* Don't bind again if we've bound already */
+                if ((idx & SOCKNAL_IRQ_BOUND) != 0)
+                        bind_irq = 0;
+                
+                sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK];
+        }
+
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
+        list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist);
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (bind_irq &&                         /* irq binding required */
+            irq != 0)                           /* hardware NIC */
+                ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers);
+
+        /* NOW it's safe to get called back when socket is ready... */
+        sock->sk->user_data = conn;
+        sock->sk->data_ready = ksocknal_data_ready;
+        sock->sk->write_space = ksocknal_write_space;
+
+        /* ...which I call right now to get things going */
+        ksocknal_data_ready (sock->sk, 0);
+        ksocknal_write_space (sock->sk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/*
+ * Close the connection to peer 'nid'; passing in a zero nid will close all
+ * connections.
+ *
+ * Matching connections are first moved from ksnd_socklist onto a private
+ * list under the socklist lock.  Each one then has its original socket
+ * callbacks restored, its user_data cleared, its scheduler's connection
+ * count dropped and finally the socklist's reference released.
+ *
+ * Must not be called in interrupt context.
+ *
+ * Returns 0 on success, or -ENOENT if a specific nid was requested and no
+ * connection to it was found.
+ */
+int
+ksocknal_close_sock(ptl_nid_t nid)
+{
+        unsigned long      flags;       /* irqsave state: must be unsigned */
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ksocknal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ksocknal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else {
+                /* move only the first connection to this peer */
+                list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+
+                        conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn->ksnc_peernid == nid) {
+                                list_del (&conn->ksnc_list);
+                                list_add (&conn->ksnc_list, &death_row);
+                                break;
+                        }
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid && list_empty (&death_row))
+                return (-ENOENT);
+
+        while (!list_empty (&death_row)) {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+
+                /* NB I _have_ to restore the callback, rather than storing
+                 * a noop, since the socket could survive past this module
+                 * being unloaded!! */
+                conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready;
+                conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space;
+
+                /* OK; no more callbacks, but they could be in progress now,
+                 * so wait for them to complete... */
+                write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+                /* ...however if I get the lock before a callback gets it,
+                 * this will make them noop
+                 */
+                conn->ksnc_sock->sk->user_data = NULL;
+
+                /* And drop the scheduler's connection count while I've got
+                 * the exclusive lock */
+                conn->ksnc_scheduler->kss_nconns--;
+
+                write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock,
+                                        flags);
+
+                ksocknal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        }
+
+        return (0);
+}
+
+/* Return the TCP option block for 'sk'.  Where it lives depends on the
+ * kernel version: before 2.5 it is sk->tp_pinfo.af_tcp; otherwise 'sk' is
+ * cast to a struct tcp_sock and its 'tcp' member is used. */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        return &(sk->tp_pinfo.af_tcp);
+}
+#else
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        struct tcp_sock *s = (struct tcp_sock *)sk;
+        return &s->tcp;
+}
+#endif
+
+/*
+ * Push a single connection.  With the socket locked, the current
+ * tp->nonagle setting is saved and forced to 1; a TCP_NODELAY setsockopt
+ * is then issued on the socket; finally the saved nonagle value is put
+ * back, again under the socket lock.
+ * NOTE(review): presumably it is the setsockopt call that flushes any
+ * queued data - confirm against the TCP implementation in use.
+ */
+void
+ksocknal_push_conn (ksock_conn_t *conn)
+{
+        struct sock    *sk = conn->ksnc_sock->sk;
+        struct tcp_opt *tp = sock2tcp_opt(sk);
+        int             nonagle;
+        int             val = 1;
+        int             rc;
+        mm_segment_t    oldmm;
+
+        lock_sock (sk);
+        nonagle = tp->nonagle;          /* remember the caller's setting */
+        tp->nonagle = 1;
+        release_sock (sk);
+
+        /* 'val' is a kernel address, so widen the address limit around the
+         * setsockopt call */
+        oldmm = get_fs ();
+        set_fs (KERNEL_DS);
+
+        rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                   (char *)&val, sizeof (val));
+        LASSERT (rc == 0);
+
+        set_fs (oldmm);
+
+        lock_sock (sk);
+        tp->nonagle = nonagle;          /* restore the original setting */
+        release_sock (sk);
+}
+
+/* Push the connection to peer 'nid'; a zero nid pushes every connection.
+ * Returns 0, or -ENOENT if a specific nid has no connection. */
+int
+ksocknal_push_sock (ptl_nid_t nid)
+{
+        ksock_conn_t      *conn;
+        struct list_head  *pos;
+        int                target = 0;
+        int                seen;
+
+        if (nid != 0) {
+                /* single peer: look it up (takes a ref), push, drop the ref */
+                conn = ksocknal_get_conn (nid);
+                if (conn == NULL)
+                        return (-ENOENT);
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+                return (0);
+        }
+
+        /* NB connections can't be taken off the socket list here, and the
+         * list may change whenever the lock is dropped, so each pass
+         * re-walks the list from the start to its 'target'th entry and
+         * stops once there is no such entry.
+         */
+        while (1) {
+                conn = NULL;
+                seen = 0;
+
+                read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+                list_for_each (pos, &ksocknal_data.ksnd_socklist) {
+                        if (seen == target) {
+                                conn = list_entry(pos, ksock_conn_t, ksnc_list);
+                                atomic_inc (&conn->ksnc_refcount); // take a ref
+                                break;
+                        }
+                        seen++;
+                }
+
+                read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                if (conn == NULL)
+                        break;
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+                target++;
+        }
+
+        return (0);
+}
+
+/* Find the first connection on the socket list whose peer is 'nid'.
+ * On success the connection's refcount has been incremented on behalf of
+ * the caller; returns NULL if there is no such connection. */
+ksock_conn_t *
+ksocknal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *pos;
+        ksock_conn_t     *candidate;
+        ksock_conn_t     *found = NULL;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        list_for_each(pos, &ksocknal_data.ksnd_socklist) {
+                candidate = list_entry(pos, ksock_conn_t, ksnc_list);
+
+                if (candidate->ksnc_peernid != nid)
+                        continue;
+
+                /* caller is referencing */
+                atomic_inc (&candidate->ksnc_refcount);
+                found = candidate;
+                break;
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        if (found != NULL) {
+                CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                       found, nid, atomic_read (&found->ksnc_refcount));
+        } else {
+                CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n",
+                       nid);
+        }
+
+        PROF_FINISH(conn_list_walk);
+        return (found);
+}
+
+/* Final teardown of a connection: release the file reference held by the
+ * connection, free the descriptor and drop one module use count. */
+void
+ksocknal_close_conn (ksock_conn_t *conn)
+{
+        struct file *file = conn->ksnc_file;
+
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (file);
+        PORTAL_FREE (conn, sizeof (*conn));
+
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+/* Dispose of a connection whose refcount has reached zero.  Outside
+ * interrupt context it is closed on the spot; in interrupt context it is
+ * queued on the reaper list and the reaper wait queue is woken. */
+void
+_ksocknal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        /* by now the conn must be unreferenced, detached from the socket's
+         * callbacks and not scheduled for receive */
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready);
+        LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space);
+        LASSERT (conn->ksnc_sock->sk->user_data == NULL);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (in_interrupt()) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list);
+                wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+        } else {
+                ksocknal_close_conn (conn);
+        }
+}
+
+/* NAL command handler registered with kportal_nal_register(): dispatch on
+ * data->ioc_nal_cmd and return the handler's result, or -EINVAL for a
+ * command this NAL does not implement.  'private' is not used. */
+int
+ksocknal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        int rc;
+
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD:
+                rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd,
+                                       data->ioc_flags);
+                break;
+
+        case NAL_CMD_CLOSE_CONNECTION:
+                rc = ksocknal_close_sock(data->ioc_nid);
+                break;
+
+        case NAL_CMD_REGISTER_MYNID:
+                rc = ksocknal_set_mynid (data->ioc_nid);
+                break;
+
+        case NAL_CMD_PUSH_CONNECTION:
+                rc = ksocknal_push_sock (data->ioc_nid);
+                break;
+
+        default:
+                rc = -EINVAL;
+                break;
+        }
+
+        return rc;
+}
+
+/* Release whichever of the NAL's bulk allocations exist: the forwarding
+ * message buffers (together with their pages), the ltx descriptors and the
+ * scheduler array. */
+void
+ksocknal_free_buffers (void)
+{
+        ksock_fmb_t *fmbs;
+        int          n;
+        int          pg;
+
+        if (ksocknal_data.ksnd_fmbs != NULL) {
+                fmbs = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs;
+
+                /* give back every page that was actually allocated */
+                for (n = 0;
+                     n < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS);
+                     n++) {
+                        for (pg = 0; pg < fmbs[n].fmb_npages; pg++) {
+                                if (fmbs[n].fmb_pages[pg] == NULL)
+                                        continue;
+
+                                __free_page (fmbs[n].fmb_pages[pg]);
+                        }
+                }
+
+                PORTAL_FREE (ksocknal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                     SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ksocknal_data.ksnd_ltxs != NULL) {
+                PORTAL_FREE (ksocknal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS +
+                                                     SOCKNAL_NNBLK_LTXS));
+        }
+
+        if (ksocknal_data.ksnd_schedulers != NULL) {
+                PORTAL_FREE (ksocknal_data.ksnd_schedulers,
+                             sizeof (ksock_sched_t) * SOCKNAL_N_SCHED);
+        }
+}
+
+/*
+ * Module teardown; ksocknal_module_init() also calls it to unwind after a
+ * partial initialisation.  ksnd_init records how far initialisation got,
+ * and each case falls through so that all earlier stages are undone too.
+ *
+ * NOTE(review): this function is marked __exit yet is called from the
+ * __init path - confirm that is safe for every way this code is built.
+ */
+void __exit
+ksocknal_module_fini (void)
+{
+        int   i;
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ksocknal_data.ksnd_init) {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(SOCKNAL);
+                PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ksocknal_ni);
+                lib_fini(&ksocknal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ksocknal_data.ksnd_socklist));
+                LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                LASSERT (list_empty (&kss->kss_tx_conns));
+                                LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (kss->kss_nconns == 0);
+                        }
+
+                /* stop router calling me */
+                kpr_shutdown (&ksocknal_data.ksnd_router);
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ksocknal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ksocknal_data.ksnd_reaper_waitq);
+
+                /* same NULL guard as the sanity checks above: never walk a
+                 * scheduler array that was not allocated */
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                wake_up_all (&kss->kss_waitq);
+                        }
+
+                /* poll once a second until every thread has gone */
+                while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) {
+                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ksocknal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ksocknal_data.ksnd_router);
+
+                ksocknal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+
+/*
+ * Module entry point.  Brings the NAL up in stages and records progress in
+ * ksnd_init so that ksocknal_module_fini() can unwind from any point:
+ *   SOCKNAL_INIT_DATA - lists, locks and wait queues initialised
+ *   SOCKNAL_INIT_PTL  - PtlNIInit() has succeeded
+ *   SOCKNAL_INIT_ALL  - threads started and the command interface
+ *                       registered
+ * Forwarding buffers are only allocated if registration with the router
+ * succeeds; a failure to register is not treated as an error.
+ *
+ * Returns 0 on success, otherwise -ENOMEM or the failing call's rc.
+ */
+int __init
+ksocknal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ksocknal_api.forward  = ksocknal_api_forward;
+        ksocknal_api.shutdown = ksocknal_api_shutdown;
+        ksocknal_api.yield    = ksocknal_api_yield;
+        ksocknal_api.validate = NULL;           /* our api validate is a NOOP */
+        ksocknal_api.lock     = ksocknal_api_lock;
+        ksocknal_api.unlock   = ksocknal_api_unlock;
+        ksocknal_api.nal_data = &ksocknal_data;
+
+        ksocknal_lib.nal_data = &ksocknal_data;
+
+        memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist);
+        rwlock_init(&ksocknal_data.ksnd_socklist_lock);
+
+        ksocknal_data.ksnd_nal_cb = &ksocknal_lib;
+        spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
+        INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+        /* mark every IRQ as not yet associated with a scheduler */
+        memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED,
+                sizeof (ksocknal_data.ksnd_irq_info));
+
+        /* flag lists/ptrs/locks initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+
+        /* nothing has been allocated before this point, so a failure here
+         * returns directly instead of going through module_fini */
+        PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
+                     sizeof(ksock_sched_t) * SOCKNAL_N_SCHED);
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                RETURN(-ENOMEM);
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+
+                spin_lock_init (&kss->kss_lock);
+                INIT_LIST_HEAD (&kss->kss_rx_conns);
+                INIT_LIST_HEAD (&kss->kss_tx_conns);
+#if SOCKNAL_ZC
+                INIT_LIST_HEAD (&kss->kss_zctxdone_list);
+#endif
+                init_waitqueue_head (&kss->kss_waitq);
+        }
+
+        /* NOTE(review): size report logged at error level - looks like a
+         * debugging aid rather than a real error; confirm */
+        CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t),
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_ltxs,
+                     sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS));
+        if (ksocknal_data.ksnd_ltxs == NULL) {
+                ksocknal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ksocknal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        /* the first SOCKNAL_NLTXS descriptors go on the normal idle list,
+         * the remainder on the 'nblk' idle list */
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ksocknal_data.ksnd_idle_ltx_list :
+                                &ksocknal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ksocknal_ni, ~0);
+
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called
+
+        /* one scheduler thread per scheduler structure */
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                rc = ksocknal_thread_start (ksocknal_scheduler,
+                                            &ksocknal_data.ksnd_schedulers[i]);
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
+                               i, rc);
+                        ksocknal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ksocknal_thread_start (ksocknal_reaper, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ksocknal_data.ksnd_router,
+                          &ksocknal_router_interface);
+        if (rc != 0) {
+                CDEBUG(D_NET, "Can't initialise routing interface "
+                       "(rc = %d): not routing\n", rc);
+        } else {
+                /* Only allocate forwarding buffers if I'm on a gateway */
+
+                PORTAL_ALLOC(ksocknal_data.ksnd_fmbs,
+                             sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                    SOCKNAL_LARGE_FWD_NMSGS));
+                if (ksocknal_data.ksnd_fmbs == NULL) {
+                        ksocknal_module_fini ();
+                        RETURN(-ENOMEM);
+                }
+
+                /* NULL out buffer pointers etc */
+                memset(ksocknal_data.ksnd_fmbs, 0,
+                       sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                              SOCKNAL_LARGE_FWD_NMSGS));
+
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
+                                 SOCKNAL_LARGE_FWD_NMSGS); i++) {
+                        ksock_fmb_t *fmb =
+                                &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i];
+
+                        /* small buffers come first in the array, large
+                         * ones after them */
+                        if (i < SOCKNAL_SMALL_FWD_NMSGS) {
+                                fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp;
+                        } else {
+                                fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp;
+                        }
+
+                        LASSERT (fmb->fmb_npages > 0);
+                        for (j = 0; j < fmb->fmb_npages; j++) {
+                                fmb->fmb_pages[j] = alloc_page (GFP_KERNEL);
+
+                                /* pages not yet allocated are still NULL
+                                 * from the memset above, which is what
+                                 * ksocknal_free_buffers() checks for */
+                                if (fmb->fmb_pages[j] == NULL) {
+                                        ksocknal_module_fini ();
+                                        return (-ENOMEM);
+                                }
+
+                                LASSERT(page_address (fmb->fmb_pages[j]) !=
+                                        NULL);
+                        }
+
+                        list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+                }
+        }
+
+        rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                ksocknal_module_fini ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
+               "mem %d)\n",
+               kpr_routing (&ksocknal_data.ksnd_router) ?
+               "enabled" : "disabled", pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ksocknal_module_init);
+module_exit(ksocknal_module_fini);
+
+EXPORT_SYMBOL (ksocknal_ni);
diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h
new file mode 100644 (file)
index 0000000..46ee3b7
--- /dev/null
@@ -0,0 +1,293 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_N_SCHED num_online_cpus()       /* # socknal schedulers (== # online CPUs) */
+
+#if PTL_LARGE_MTU
+# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10)      /* biggest payload I can forward (256KB) */
+#else
+# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)       /* biggest payload I can forward (64KB) */
+#endif
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer:
+                                                * a header plus the biggest forwardable
+                                                * payload, rounded up to whole pages */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)  /* 80% of the socket's send buffer size */
+
+typedef struct                                  /* pool of forwarding buffers (ksock_nal_data_t
+                                                 * holds one pool of small and one of large buffers) */
+{
+        spinlock_t        fmp_lock;             /* serialise */
+        struct list_head  fmp_idle_fmbs;        /* idle ksock_fmb_t's (linked by fmb_list) waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+
+typedef struct                                  /* per scheduler state */
+{
+        spinlock_t        kss_lock;             /* serialise access to the queues below */
+        struct list_head  kss_rx_conns;         /* conn waiting to be read */
+        struct list_head  kss_tx_conns;         /* conn waiting to be written (linked by ksnc_tx_list) */
+#if SOCKNAL_ZC
+        struct list_head  kss_zctxdone_list;    /* completed ZC transmits (ksock_tx_t's linked by tx_list,
+                                                 * queued by ksocknal_zc_callback()) */
+#endif
+        wait_queue_head_t kss_waitq;            /* where scheduler sleeps */
+        int               kss_nconns;           /* # connections assigned to this scheduler */
+} ksock_sched_t;
+
+typedef struct {                                /* global socknal state (see ksocknal_data) */
+        int               ksnd_init;            /* initialisation state (SOCKNAL_INIT_*) */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+        ptl_nid_t         ksnd_mynid;           /* presumably this node's NID (see ksocknal_set_mynid()) - TODO confirm */
+        nal_cb_t         *ksnd_nal_cb;
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock (taken by ksocknal_cli(), dropped by ksocknal_sti()) */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        ksock_sched_t    *ksnd_schedulers;      /* scheduler state */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        spinlock_t        ksnd_idle_ltx_lock;   /* serialise ltx alloc/free (guards both idle lists below) */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        unsigned char     ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup (presumably SOCKNAL_IRQ_* encoded) */
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0               /* ksnd_init values: how far initialisation got */
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3               /* set by module init once everything is initialised */
+
+#define SOCKNAL_IRQ_BOUND       0x80            /* flag we _did_ bind already */
+#define SOCKNAL_IRQ_SCHED_MASK 0x7f            /* we assume < 127 CPUs (scheduler index lives in the low 7 bits) */
+#define SOCKNAL_IRQ_UNASSIGNED  0xff            /* flag unassigned */
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments and 0 or more ptl_kiov_t fragments.  Forwarded
+ * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0
+ * ptl_kiov_t fragments.  Messages from an MD with PTL_MD_KIOV set, have 1
+ * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t
+ * fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, if the message
+ * requires forwarding or will be received into mapped memory, up to
+ * PTL_MD_MAX_IOV struct iovec fragments describe the target memory.
+ * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
+ */
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;        /* queue on conn for transmission etc */
+        char                    tx_isfwd;       /* non-zero: forwarded (embedded in a kpr_fwd_desc_t) /
+                                                 * zero: sourced here (embedded in a ksock_ltx_t) */
+        int                     tx_nob;         /* # packet bytes still to send */
+        int                     tx_niov;        /* # packet iovec frags still to send */
+        struct iovec           *tx_iov;         /* packet iovec frags (next one to send) */
+        int                     tx_nkiov;       /* # packet page frags still to send */
+        ptl_kiov_t             *tx_kiov;        /* packet page frags (next one to send) */
+#if SOCKNAL_ZC        
+        ksock_sched_t          *tx_sched;       /* who to wake on callback */
+        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
+#endif
+} ksock_tx_t;
+
+#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
+/* network zero copy callback descriptor embedded in ksock_tx_t.
+ * NB only usable when SOCKNAL_ZC is set: tx_zccd doesn't exist otherwise. */
+
+/* space for the tx frag descriptors: hdr is always 1 iovec
+ * and payload is up to PTL_MD_MAX_IOV frags of either type
+ * (the two payload arrays share storage). */
+typedef struct
+{
+        struct iovec            hdr;
+        union {
+                struct iovec    iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+        }                       payload;
+} ksock_txiovspace_t;
+
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info (KSOCK_TX_2_KSOCK_LTX() maps it back to this struct) */
+        struct list_head       *ltx_idle;       /* where to put when idle (the idle list it is returned to
+                                                 * by ksocknal_tx_done()) */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        ksock_txiovspace_t      ltx_iov_space;  /* where to stash frag descriptors */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch;
+ * only valid for a ksock_tx_t with tx_isfwd set */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx;
+ * only valid for a ksock_tx_t with tx_isfwd clear */
+
+/* NB list_entry() is used here as a convenient macro for calculating a
+ * pointer to a struct from the address of one of its members; no list
+ * is involved.
+ */
+
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle (on the owning pool's fmp_idle_fmbs) */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated (small or large buffer size) */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; /* only the first fmb_npages are allocated */
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to PTL_MD_MAX_IOV frags of payload of either type (the
+ * two arrays share storage, so only one kind is in use at a time). */
+typedef union {
+        struct iovec    iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header (presumably ksnc_rx_state values - TODO confirm) */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+typedef struct                                  /* per-connection (socket) state */
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* actual socket */
+        void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+        void               *ksnc_saved_write_space; /* socket's original write_space() callback */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users (dropped with ksocknal_put_conn()) */
+        ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        volatile int        ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # iovec frags (still to fill) */
+        struct iovec       *ksnc_rx_iov;        /* the iovec frags (next one to fill) */
+        int                 ksnc_rx_nkiov;      /* # page frags (still to fill) */
+        ptl_kiov_t         *ksnc_rx_kiov;       /* the page frags (next one to fill) */
+        ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space (on scheduler's kss_tx_conns) */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent (ksock_tx_t's linked by tx_list) */
+        volatile int        ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+
+} ksock_conn_t;
+
+extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client);
+extern int ksocknal_close_sock(ptl_nid_t nid);
+extern int ksocknal_set_mynid(ptl_nid_t nid);
+extern int ksocknal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid);
+extern void _ksocknal_put_conn (ksock_conn_t *conn);
+extern void ksocknal_close_conn (ksock_conn_t *conn);
+
+/* Drop one reference on 'conn'.  Whoever drops the last reference hands
+ * the connection on to _ksocknal_put_conn(). */
+static inline void
+ksocknal_put_conn (ksock_conn_t *conn)
+{
+        /* NB the count logged is the value _before_ this reference goes */
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        if (!atomic_dec_and_test (&conn->ksnc_refcount))
+                return;                         /* still in use */
+
+        _ksocknal_put_conn (conn);
+}
+
+extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern void ksocknal_data_ready(struct sock *sk, int n);
+extern void ksocknal_write_space(struct sock *sk);
+
+
+extern nal_cb_t         ksocknal_lib;
+extern ksock_nal_data_t ksocknal_data;
diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c
new file mode 100644 (file)
index 0000000..388554d
--- /dev/null
@@ -0,0 +1,1612 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+atomic_t   ksocknal_packets_received;
+atomic_t   ksocknal_packets_launched;
+atomic_t   ksocknal_packets_being_sent;        /* decremented by ksocknal_tx_done() */
+
+#if SOCKNAL_ZC
+int        ksocknal_do_zc = 1;                 /* non-zero => attempt zero-copy sends */
+int        ksocknal_zc_min_frag = 2048;        /* smallest fragment (bytes) worth sending zero-copy */
+#endif
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* lib 'read' callback: copy 'len' bytes from 'src_addr' to 'dst_addr'.
+ * Both addresses are used directly by memcpy(); 'private' is unused.
+ * Always returns 0. */
+int
+ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET,
+               LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid,
+               (long)len,
+               src_addr,
+               dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/* lib 'write' callback: copy 'len' bytes from 'src_addr' to 'dst_addr'.
+ * Both addresses are used directly by memcpy(); 'private' is unused.
+ * Always returns 0. */
+int
+ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        CDEBUG(D_NET,
+               LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid,
+               (long)len,
+               src_addr,
+               dst_addr);
+
+        memcpy(dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/* lib event callback: pass 'ev' to the event queue's handler, if it has
+ * one.  Always returns 0. */
+int
+ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                         ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback == NULL)         /* nobody to tell */
+                return (0);
+
+        eq->event_callback(ev);
+        return (0);
+}
+
+/* lib 'malloc' callback: allocate 'len' zeroed bytes with PORTAL_ALLOC().
+ * Returns NULL if the allocation failed. */
+void *
+ksocknal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        if (buf == NULL)
+                return (NULL);
+
+        memset(buf, 0, len);
+        return (buf);
+}
+
+/* lib 'free' callback (counterpart of ksocknal_malloc()): release 'buf',
+ * which is 'len' bytes long, with PORTAL_FREE(). */
+void
+ksocknal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/* lib 'printf' callback: format the message into a fixed 256 byte buffer
+ * (longer messages are truncated) and emit it as a D_NET debug message. */
+void
+ksocknal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    msg[256];
+        va_list ap;
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* bounded by the buffer size */
+        va_end (ap);
+
+        /* belt and braces: guarantee termination whatever vsnprintf did */
+        msg[sizeof (msg) - 1] = 0;
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+/* lib 'cli' callback: enter the lib critical section by taking
+ * ksnd_nal_cb_lock.  NB a plain spin_lock() is used; 'flags' is neither
+ * read nor written. */
+void
+ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+
+        data = nal->nal_data;
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+/* lib 'sti' callback: leave the lib critical section entered with
+ * ksocknal_cli() by dropping ksnd_nal_cb_lock.  'flags' is neither read
+ * nor written. */
+void
+ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+/* lib 'dist' callback: report the distance to 'nid' in '*dist': 0 if it is
+ * this interface's own NID, 1 for anything else.  Always returns 0. */
+int
+ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* NB every other NID is reported as 1 hop away.  Arguably a NID we
+         * have no connection to (ksocknal_get_conn(nid) == NULL) when we're
+         * not routing is very distant indeed :) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return (0);
+}
+
+/* Get an idle local transmit descriptor.
+ *
+ * The normal idle list is always tried first.  If it is empty and
+ * 'may_block' is set, sleep until a normal descriptor is returned and try
+ * again; if 'may_block' is clear, fall back to the list reserved for
+ * callers that can't block.
+ *
+ * Returns the descriptor (removed from its idle list), or NULL if
+ * 'may_block' is clear and both lists are empty. */
+ksock_ltx_t *
+ksocknal_get_ltx (int may_block)
+{
+        unsigned long    flags;         /* NB spin_lock_irqsave() wants unsigned long */
+        ksock_ltx_t     *ltx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) {
+                        ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next,
+                                         ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        /* can't wait: dip into the reserve (may be empty too) */
+                        if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) {
+                                ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next,
+                                                 ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                /* drop the lock before sleeping; the list is re-checked
+                 * under the lock next time round, since someone else may
+                 * grab the descriptor that woke me */
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock,
+                                       flags);
+
+                wait_event (ksocknal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ksocknal_data.ksnd_idle_ltx_list));
+        }
+
+        /* both 'break's above leave the loop with the lock held */
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        return (ltx);
+}
+
+#if SOCKNAL_ZC
+/* Find the struct page backing kernel virtual address 'vaddr'.
+ * Addresses in the vmalloc range (and, with CONFIG_HIGHMEM, the pkmap
+ * range) are looked up with vmalloc_to_page(); anything else with
+ * virt_to_page().  Returns NULL if no valid page was found. */
+struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START && vaddr < VMALLOC_END) {
+                page = vmalloc_to_page ((void *)vaddr);
+        }
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* in 2.4 vmalloc_to_page() just walks the page tables */
+                page = vmalloc_to_page ((void *)vaddr);
+        }
+#endif
+        else {
+                page = virt_to_page (vaddr);
+        }
+
+        if (page != NULL && VALID_PAGE (page))
+                return (page);
+
+        return (NULL);
+}
+#endif
+
+/* Send (part of) the first iovec fragment of 'tx' on 'sock' without
+ * blocking.  'more' says whether more data will follow this fragment.
+ *
+ * With SOCKNAL_ZC, a sufficiently large fragment on a capable route is
+ * sent zero-copy, at most up to the end of the page it starts in;
+ * otherwise the fragment is passed to the socket's sendmsg().
+ *
+ * tx_nob and the fragment descriptors are updated to account for
+ * whatever was sent.  Returns:
+ *   1        progress was made and the socket took all it was offered;
+ *            the caller may continue with what remains of 'tx'
+ *   -EAGAIN  the socket took only part of what it was offered
+ *   <= 0     whatever the socket returned (nothing was sent)
+ */
+int
+ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        struct iovec  *iov = tx->tx_iov;
+        int            fragsize = iov->iov_len;
+        unsigned long  vaddr = (unsigned long)iov->iov_base;
+        int            offered = fragsize;      /* # bytes handed to the socket */
+#if SOCKNAL_ZC
+        int            offset = vaddr & (PAGE_SIZE - 1);
+        int            zcsize = MIN (fragsize, PAGE_SIZE - offset);
+        struct page   *page;
+#endif
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (tx->tx_niov > 0);
+        more |= (tx->tx_niov > 1);
+        
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            zcsize >= ksocknal_zc_min_frag &&
+            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
+                
+                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
+                       (void *)vaddr, page, page_address(page), offset, zcsize);
+
+                /* only the part of the frag within this page goes now */
+                more |= (zcsize < fragsize);
+                offered = zcsize;
+
+                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, 
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* NB don't pass tx's iov; sendmsg may or may not update it */
+                struct iovec fragiov = { .iov_base = (void *)vaddr,
+                                         .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+        } 
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* frag not finished: remember where to resume */
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len  = fragsize - rc;
+
+                /* socket took less than it was offered => it's full */
+                if (rc < offered)
+                        return (-EAGAIN);
+
+                /* socket took the whole page-limited chunk it was offered
+                 * (zero-copy only): it isn't known to be full, so let the
+                 * caller carry on with the rest of this frag rather than
+                 * wait for a write_space() callback */
+                return (1);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_iov++;
+        tx->tx_niov--;
+        return (1);
+}
+
+int
+ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        ptl_kiov_t    *kiov = tx->tx_kiov;
+        int            fragsize = kiov->kiov_len;
+        struct page   *page = kiov->kiov_page;
+        int            offset = kiov->kiov_offset;
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        LASSERT (tx->tx_nkiov > 0);
+        more |= (tx->tx_nkiov > 1);
+
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            fragsize >= ksocknal_zc_min_frag) {
+
+                CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                               page, offset, fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, fragsize,
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                char *addr = ((char *)kmap (page)) + offset;
+                struct iovec fragiov = {.iov_base = addr,
+                                        .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t  oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+                kunmap (page);
+        }
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len    = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_kiov++;
+        tx->tx_nkiov--;
+        return (1);
+}
+
+/* Send as much of 'tx' as 'sock' will take without blocking: iovec frags
+ * first, then page frags, one frag per socket call.  'more' says whether
+ * further packets are queued behind this one.
+ *
+ * Returns 0 if all of 'tx' went, if the socket filled up, or if an error
+ * occurred after some progress had already been made by this call (the
+ * caller tells these apart by checking tx_nob).  Returns the socket's
+ * return code (<= 0) if the very first attempt failed. */
+int
+ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        int    sent_some;
+        int    rc;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        for (sent_some = 0; ; sent_some = 1) {
+                if (tx->tx_niov == 0)
+                        rc = ksocknal_send_kiov (sock, tx, more);
+                else    /* page frags still to follow count as "more" */
+                        rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0);
+
+                if (rc <= 0) {
+                        /* error or partial send.  NB a zero rc that isn't
+                         * preceded by progress is passed back as is, which
+                         * callers see the same as success/-EAGAIN
+                         * (Adaptech TOE) */
+                        if (sent_some || rc == -EAGAIN)
+                                RETURN (0);
+
+                        RETURN (rc);
+                }
+
+                if (tx->tx_nob == 0)            /* sent everything */
+                        RETURN (0);
+        }
+}
+
+/* Receive, without blocking, into the first iovec fragment posted on
+ * 'conn'.  The connection's byte counts and fragment descriptors are
+ * updated to account for whatever arrived.
+ *
+ * Returns 1 if the fragment was filled, -EAGAIN if it was only partly
+ * filled, and otherwise (<= 0) whatever sock_recvmsg() returned. */
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec  *frag = conn->ksnc_rx_iov;
+        int            fragsize = frag->iov_len;
+        char          *base = frag->iov_base;
+        /* NB a scratch copy is passed to the socket: socket ops can't be
+         * trusted either to consume our frag descriptors or to leave them
+         * alone, so only 1 frag is received at a time. */
+        struct iovec   scratch = { .iov_base = base,
+                                   .iov_len  = fragsize};
+        struct msghdr  mh = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &scratch,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t   saved_fs = get_fs();
+        int            rc;
+
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+
+        /* the target is kernel memory */
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &mh, fragsize, MSG_DONTWAIT);
+        set_fs (saved_fs);
+
+        if (rc <= 0)
+                return (rc);
+
+        conn->ksnc_rx_nob_left   -= rc;
+        conn->ksnc_rx_nob_wanted -= rc;
+
+        if (rc < fragsize) {
+                /* short read: remember where to resume in this frag */
+                frag->iov_base = (void *)(base + rc);
+                frag->iov_len  = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* frag is full: step on to the next one */
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_iov++;
+        conn->ksnc_rx_niov--;
+        return (1);
+}
+
+/* Receive, without blocking, into the first page fragment posted on
+ * 'conn'.  The page is mapped only for the duration of the socket call.
+ * The connection's byte counts and fragment descriptors are updated to
+ * account for whatever arrived.
+ *
+ * Returns 1 if the fragment was filled, -EAGAIN if it was only partly
+ * filled, and otherwise (<= 0) whatever sock_recvmsg() returned. */
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+        ptl_kiov_t    *frag = conn->ksnc_rx_kiov;
+        struct page   *page = frag->kiov_page;
+        int            offset = frag->kiov_offset;
+        int            fragsize = frag->kiov_len;
+        char          *kaddr = ((char *)kmap (page)) + offset;
+        /* NB socket ops can't be trusted either to consume our frag
+         * descriptors or to leave them alone, so only 1 frag is received
+         * at a time, via a scratch iovec. */
+        struct iovec   scratch = { .iov_base = kaddr,
+                                   .iov_len  = fragsize};
+        struct msghdr  mh = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &scratch,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t   saved_fs = get_fs();
+        int            rc;
+
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+
+        /* the target is kernel memory */
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &mh, fragsize, MSG_DONTWAIT);
+        set_fs (saved_fs);
+        kunmap (page);
+
+        if (rc <= 0)
+                return (rc);
+
+        conn->ksnc_rx_nob_left   -= rc;
+        conn->ksnc_rx_nob_wanted -= rc;
+
+        if (rc < fragsize) {
+                /* short read: remember where to resume in this frag */
+                frag->kiov_offset = offset + rc;
+                frag->kiov_len    = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* frag is full: step on to the next one */
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_kiov++;
+        conn->ksnc_rx_nkiov--;
+        return (1);
+}
+
+/* Receive into the fragments posted on 'conn' (iovec frags first, then
+ * page frags, one frag per socket call) until everything wanted has
+ * arrived or the socket has nothing more to give.
+ *
+ * CAVEAT EMPTOR: unlike ksocknal_sendmsg() we return...
+ *   > 0  for success: all wanted bytes arrived, or the socket ran dry
+ *        (check ksnc_rx_nob_wanted to tell which)
+ *   <= 0 for error (0 == EOF), even if some data arrived before it.
+ *
+ * NOTE(review): this used to carry a 'got_some' flag, apparently meant to
+ * mask an error/EOF that followed a successful fragment (as
+ * ksocknal_sendmsg() does with 'sent_some'), but it was only ever
+ * assigned 0, so errors and EOF have always been returned immediately.
+ * That behaviour is kept and the dead flag dropped; before masking errors
+ * here, confirm the caller is guaranteed to retry the read, or an EOF
+ * could go unnoticed. */
+int
+ksocknal_recvmsg (ksock_conn_t *conn) 
+{
+        int    rc;
+        ENTRY;
+        
+        LASSERT (!in_interrupt ());
+
+        for (;;) {
+                LASSERT (conn->ksnc_rx_nob_wanted > 0);
+                
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                if (rc <= 0)                    /* error/EOF or partial receive */
+                        RETURN ((rc == -EAGAIN) ? 1 : rc);
+                
+                if (conn->ksnc_rx_nob_wanted == 0)
+                        RETURN (1);
+        }
+}
+
+#if SOCKNAL_ZC
+/* Zero-copy completion callback for the descriptor embedded in a
+ * ksock_tx_t: queue the tx on its scheduler's "ZC done" list and wake the
+ * scheduler if it is sleeping. */
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t    *tx;
+        ksock_sched_t *sched;
+        unsigned long  irqflags;
+        ENTRY;
+
+        tx = KSOCK_ZCCD_2_TX(zcd);
+        sched = tx->tx_sched;
+
+        /* The tx can't be cleaned up right here (lock conflicts), so it is
+         * passed to the scheduler to finish off */
+        spin_lock_irqsave (&sched->kss_lock, irqflags);
+
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, irqflags);
+
+        EXIT;
+}
+#endif
+
+/* Complete transmit descriptor 'tx'.
+ *
+ * A forwarded packet is handed back to the router with kpr_fwd_done().
+ * A locally sourced packet is finalized with lib_finalize() and its
+ * ksock_ltx_t is put back on the idle list it came from; if that is the
+ * normal idle list, anyone blocked waiting for a descriptor is woken. */
+void
+ksocknal_tx_done (ksock_tx_t *tx)
+{
+        unsigned long  flags;           /* NB spin_lock_irqsave() wants unsigned long */
+        ksock_ltx_t   *ltx;
+        ENTRY;
+
+        atomic_dec (&ksocknal_packets_being_sent);
+
+        if (tx->tx_isfwd) {             /* was a forwarded packet? */
+                kpr_fwd_done (&ksocknal_data.ksnd_router,
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                EXIT;
+                return;
+        }
+
+        /* local send */
+        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+        /* normal tx desc => wakeup anyone blocking for one.  (Nobody ever
+         * blocks for a descriptor from the non-blocking reserve.) */
+        if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list &&
+            waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq))
+                wake_up (&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+        EXIT;
+}
+
+/*
+ * Transmit the packet at the head of the first connection queued on
+ * sched->kss_tx_conns.
+ *
+ * Entered with sched->kss_lock held and the saved irq state in
+ * *irq_flags.  The lock is dropped around the socket write and is held
+ * again when this function returns.
+ *
+ * NOTE(review): the scheduler passes the address of an 'unsigned long'
+ * here while the prototype says 'long *'; the prototype is part of the
+ * interface, so it is left as it is.
+ */
+void
+ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_tx_t *tx;
+        int         rc;
+
+        /* take the first conn with pending tx off the scheduler's list */
+        LASSERT (!list_empty (&sched->kss_tx_conns));
+        conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
+        list_del (&conn->ksnc_tx_list);
+
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+        tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        /* assume transmit will complete now, so dequeue while I've got lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to write */
+
+        /* last argument tells the sender whether more packets are queued */
+        rc = ksocknal_sendmsg (conn->ksnc_sock, tx, 
+                               !list_empty (&conn->ksnc_tx_queue)); /* more to come? */
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc != 0) {
+#warning FIXME: handle socket errors properly
+                CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                /* kid on for now the whole packet went.
+                 * NB when we handle the error better, we'll still need to
+                 * block for zccd completion.
+                 */
+                tx->tx_nob = 0;
+        }
+
+        if (tx->tx_nob == 0)                    /* nothing left to send */
+        {
+                /* everything went; assume more can go, so prevent write_space locking */
+                conn->ksnc_tx_ready = 1;
+
+                ksocknal_put_conn (conn);       /* release packet's ref */
+                /* counter is decremented again in ksocknal_tx_done() */
+                atomic_inc (&ksocknal_packets_being_sent);
+#if SOCKNAL_ZC
+                if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
+                        /* zccd skbufs are still in-flight.  Release my
+                         * initial ref on zccd, so callback can occur */
+                        zccd_put (&tx->tx_zccd);
+                } else
+#endif
+                        ksocknal_tx_done (tx);
+
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+        } else {
+                /* partial send: retake the lock before touching the queue */
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        /* kss_lock is held again from here on */
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                                 /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+}
+
+void
+ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long  flags;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete portals header.
+         */
+        LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t));
+        
+        CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n",
+                ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, 
+                tx->tx_niov, tx->tx_nkiov);
+
+#if SOCKNAL_ZC
+        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
+        /* NB this sets 1 ref on zccd, so the callback can only occur
+         * after I've released this ref */
+        tx->tx_sched = sched;
+#endif
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled) {          /* not scheduled to send */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&sched->kss_waitq))
+                        wake_up (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        atomic_inc (&ksocknal_packets_launched);
+}
+
+/*
+ * Find the connection to use for sending to 'nid'.
+ *
+ * Returns the connection to 'nid' itself if there is one; otherwise asks
+ * the router for a gateway and returns the connection to that gateway.
+ * Returns NULL (after logging) if no route or no gateway connection can
+ * be found.  The connection comes from ksocknal_get_conn(); callers in
+ * this file release it with ksocknal_put_conn().
+ */
+ksock_conn_t *
+ksocknal_send_target (ptl_nid_t nid) 
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     gw_nid;
+        int           err;
+
+        /* direct peer? */
+        conn = ksocknal_get_conn (nid);
+        if (conn != NULL)
+                return (conn);
+
+        /* not a peer: ask the router which gateway to use */
+        err = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gw_nid);
+        if (err != 0) {
+                CERROR("Can't route to "LPX64": router error %d\n",
+                       nid, err);
+                return (NULL);
+        }
+
+        conn = ksocknal_get_conn (gw_nid);
+        if (conn == NULL) {
+                CERROR ("Can't route to "LPX64": gateway "LPX64
+                        " is not a peer\n", nid, gw_nid);
+                return (NULL);
+        }
+
+        return (conn);
+}
+
+/*
+ * Get a local transmit descriptor and prime it for sending 'hdr'.
+ *
+ * The header is copied into the descriptor, the lib_finalize() arguments
+ * ('private', 'cookie') are stashed, and the embedded tx is set up with a
+ * single mapped fragment covering the header and no page fragments.
+ * Returns NULL if no descriptor could be obtained.
+ */
+ksock_ltx_t *
+ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                    ptl_hdr_t *hdr, int type)
+{
+        ksock_ltx_t *ltx;
+        ksock_tx_t  *tx;
+        int          may_block;
+
+        /* Blocking for a descriptor is not allowed when that could stall
+         * the receiver (ACK/REPLY) or an interrupt handler. */
+        may_block = !(type == PTL_MSG_ACK ||
+                      type == PTL_MSG_REPLY ||
+                      in_interrupt ());
+
+        ltx = ksocknal_get_ltx (may_block);
+        if (ltx == NULL) {
+                CERROR ("Can't allocate tx desc\n");
+                return (NULL);
+        }
+
+        /* local send state: private copy of the hdr + finalize() args */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        /* the embedded generic tx */
+        tx = &ltx->ltx_tx;
+        tx->tx_isfwd = 0;
+        tx->tx_nob = sizeof (*hdr);
+
+        /* exactly 1 mapped frag so far: the header copy */
+        tx->tx_niov = 1;
+        tx->tx_iov = &ltx->ltx_iov_space.hdr;
+        tx->tx_iov[0].iov_base = &ltx->ltx_hdr;
+        tx->tx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        /* no page frags yet */
+        tx->tx_kiov  = NULL;
+        tx->tx_nkiov = 0;
+
+        return (ltx);
+}
+
+/*
+ * Send a header plus a payload described by mapped fragments to 'nid'.
+ *
+ * Returns 0 once the packet has been queued, or -1 if there is no route
+ * or no tx descriptor.  Callers ignore the return code, so on failure
+ * the message is still completed with lib_finalize() here.
+ */
+int
+ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        ksock_tx_t   *tx;
+
+        /* NB the meaning of 'private' depends on what is being sent, so
+         * it is only passed through, never interpreted.
+         *
+         * We'll have to wait for 3.2 to return an error event; until
+         * then a failed send just completes the message.
+         */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        tx = &ltx->ltx_tx;
+
+        /* setup_hdr left exactly the header frag in place */
+        LASSERT (tx->tx_niov == 1 && tx->tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* payload frags go straight after the header frag */
+        memcpy (&tx->tx_iov[1], payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        tx->tx_niov = 1 + payload_niov;
+        tx->tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, tx);
+        return (0);
+}
+
+/*
+ * Send a header plus a page-based (kiov) payload to 'nid'.
+ *
+ * Returns 0 once the packet has been queued, or -1 if there is no route
+ * or no tx descriptor.
+ *
+ * As ksocknal_send() notes, the return code of the send callbacks is
+ * ignored, so a message that cannot be sent must still be completed with
+ * lib_finalize(); both failure paths below do that, matching
+ * ksocknal_send().
+ */
+int
+ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+        
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                /* no route: still complete the message */
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                /* no tx descriptor: still complete the message */
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        /* setup_hdr left exactly the header frag in place */
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        
+        /* the payload goes out as page frags; the header stays mapped */
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov, 
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/*
+ * Router callback: forward the packet described by 'fwd'.
+ *
+ * The packet goes to fwd->kprfd_gateway_nid, unless that is this node,
+ * in which case this is the last hop and it goes to the target nid.  The
+ * tx descriptor lives in the forward descriptor's scratch space.  If the
+ * next hop is not a peer, the forward is completed with -EHOSTUNREACH.
+ */
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_tx_t   *tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+        ksock_conn_t *conn;
+        ptl_nid_t     nid;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* If I'm the gateway, this must be the last hop */
+        if (fwd->kprfd_gateway_nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+        else
+                nid = fwd->kprfd_gateway_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* From here on the forward owns a ref on conn */
+
+        tx->tx_isfwd = 1;               /* mark as a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_iov   = fwd->kprfd_iov;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_kiov  = NULL;            /* forwards carry no page frags */
+        tx->tx_nkiov = 0;
+
+        ksocknal_launch_packet (conn, tx);
+}
+
+/*
+ * Start a kernel thread running fn(arg).
+ *
+ * Returns 0 on success, after counting the thread in
+ * ksocknal_data.ksnd_nthreads, or the negative value returned by
+ * kernel_thread() on failure.
+ */
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    rc;
+
+        rc = kernel_thread (fn, arg, 0);
+        if (rc >= 0) {
+                atomic_inc (&ksocknal_data.ksnd_nthreads);
+                return (0);
+        }
+
+        return ((int)rc);
+}
+
+/*
+ * Called by a thread as it exits: drops the thread count that
+ * ksocknal_thread_start() raised.
+ */
+void
+ksocknal_thread_fini (void)
+{
+        /* one thread fewer */
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+}
+
+/*
+ * Completion callback for a forwarded message buffer (fmb); installed via
+ * kpr_fwd_init() in ksocknal_init_fmb().
+ *
+ * 'arg' is the fmb and 'error' the outcome of the forward, which is only
+ * logged.  The fmb goes back on its pool's idle list.  If a connection is
+ * blocked waiting for a buffer from that pool, the first one is taken off
+ * the blocked list, put back into SOCKNAL_RX_GET_FMB state and queued on
+ * its scheduler's rx list so it retries getting a buffer.
+ */
+void
+ksocknal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn = NULL;
+        ksock_sched_t     *sched;
+        unsigned long      flags; /* spin_lock_irqsave() wants unsigned long */
+
+        if (error != 0)
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),
+                       error);
+        else
+                CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
+                        NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+
+        spin_lock_irqsave (&fmp->fmp_lock, flags);
+
+        /* buffer is free again */
+        list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
+
+        /* anyone waiting for a buffer from this pool? */
+        if (!list_empty (&fmp->fmp_blocked_conns)) {
+                conn = list_entry (fmp->fmp_blocked_conns.next,
+                                   ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+        }
+
+        spin_unlock_irqrestore (&fmp->fmp_lock, flags);
+
+        if (conn == NULL)
+                return;
+
+        CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+        /* make the conn ask for a buffer again when next scheduled */
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+
+        sched = conn->ksnc_scheduler;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+}
+
+/*
+ * Get an idle forwarding message buffer (fmb) for the packet 'conn' is
+ * receiving.
+ *
+ * The small or large pool is chosen by total packet size (header plus
+ * conn->ksnc_rx_nob_left).  Returns the first idle buffer of that pool if
+ * there is one.  Otherwise the conn is put into SOCKNAL_RX_FMB_SLEEP
+ * state and onto the pool's blocked list (ksocknal_fmb_callback()
+ * reschedules it when a buffer is freed) and NULL is returned.
+ */
+ksock_fmb_t *
+ksocknal_get_idle_fmb (ksock_conn_t *conn)
+{
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        unsigned long     flags; /* spin_lock_irqsave() wants unsigned long */
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (ksocknal_data.ksnd_fmbs != NULL);
+
+        /* pick the pool by whole-packet size */
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ksocknal_data.ksnd_small_fmp;
+        else
+                pool = &ksocknal_data.ksnd_large_fmp;
+
+        spin_lock_irqsave (&pool->fmp_lock, flags);
+
+        if (!list_empty (&pool->fmp_idle_fmbs)) {
+                fmb = list_entry(pool->fmp_idle_fmbs.next,
+                                 ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                spin_unlock_irqrestore (&pool->fmp_lock, flags);
+
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+
+        spin_unlock_irqrestore (&pool->fmp_lock, flags);
+        return (NULL);
+}
+
+
+/*
+ * Set up forwarding buffer 'fmb' for the packet whose header 'conn' has
+ * just read.
+ *
+ * The header is copied into the first page of the buffer.  With no
+ * payload the forward is started right away, the conn is reset for the
+ * next packet and 1 is returned.  Otherwise the buffer's iov is built to
+ * span the whole packet, the conn's rx iov is pointed at the payload part
+ * of the buffer, the conn moves to SOCKNAL_RX_BODY_FWD state and 0 is
+ * returned; the forward is started later, once the body has been read.
+ */
+int
+ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int niov;                               /* at least the header */
+        int nob;
+
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+
+        /* copy header */
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t));
+
+        if (payload_nob == 0) {         /* got complete packet already */
+                atomic_inc (&ksocknal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        dest_nid, packet_nob);
+
+                /* header only: a single frag */
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                              packet_nob, 1, fmb->fmb_iov,
+                              ksocknal_fmb_callback, fmb);
+
+                /* forward it now */
+                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        /* build fmb_iov: one frag per buffer page the packet touches */
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE) {  /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        } else {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;   /* bytes beyond the first page */
+
+                do {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base =
+                                page_address (fmb->fmb_pages[niov]);
+                        /* last frag may be shorter than a page */
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                      packet_nob, niov, fmb->fmb_iov,
+                      ksocknal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
+                 sizeof (struct iovec));
+
+        /* first rx frag starts just past the header within the first page */
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        conn->ksnc_rx_iov[0].iov_base =
+                (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) +
+                         sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len =
+                fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        /* remaining frags are the same as the buffer's */
+        if (niov > 1)
+                memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1],
+                       (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
+
+/*
+ * Decide what to do with a packet whose header has just been read and
+ * which is not addressed to this node.
+ *
+ * The packet is dropped (its body, if any, is skipped via
+ * ksocknal_new_packet()) when the length is corrupt, when this node is
+ * not forwarding, when the body is too big to forward, or when the
+ * destination is a direct peer of this node (the sender should have gone
+ * direct).  Otherwise the conn is put into SOCKNAL_RX_GET_FMB state with
+ * the body length stashed, so the caller goes on to get a forwarding
+ * buffer.
+ *
+ * NIDs in conn->ksnc_hdr are in wire byte order; they are converted once
+ * with NTOH__u64() and the converted values are used for both the peer
+ * lookup and all log messages.
+ */
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t     src_nid = NTOH__u64 (conn->ksnc_hdr.src_nid);
+        ptl_nid_t     dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int           body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                src_nid, dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) {                 /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", src_nid, dest_nid, body_len);
+                ksocknal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) {        /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", src_nid, dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) {      /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n", src_nid, dest_nid,
+                        body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct; look the peer up by host-order nid */
+        conn2 = ksocknal_get_conn (dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n", src_nid, dest_nid);
+                ksocknal_put_conn (conn2);  /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+/*
+ * Prepare 'conn' for what comes next on the wire.
+ *
+ * With nob_to_skip == 0 the conn is set up to read the next packet
+ * header into conn->ksnc_hdr and 1 is returned.  Otherwise the conn is
+ * put into SOCKNAL_RX_SLOP state with its rx iov aimed at a shared
+ * throw-away buffer, covering as many of the bytes to skip as the conn's
+ * iov space allows, and 0 is returned.
+ */
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ksocknal_slop_buffer[4096];
+
+        struct iovec *iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        int           chunk;
+        int           nfrags;
+        int           total;
+
+        if (nob_to_skip == 0) {
+                /* sitting exactly on a packet boundary: read a header */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov = iov;
+                iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                return (1);
+        }
+
+        /* Arrange to discard as much as possible in one go.  Whatever
+         * doesn't fit (iov entries exhausted) stays accounted for in
+         * ksnc_rx_nob_left. */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = iov;
+
+        total = 0;
+        nfrags = 0;
+
+        for (;;) {
+                chunk = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                /* every frag lands in the same slop buffer */
+                iov[nfrags].iov_base = ksocknal_slop_buffer;
+                iov[nfrags].iov_len  = chunk;
+                nfrags++;
+
+                total += chunk;
+                nob_to_skip -= chunk;
+
+                if (nob_to_skip == 0)
+                        break;
+
+                /* mustn't overflow conn's rx iov */
+                if (nfrags >= sizeof(conn->ksnc_rx_iov_space) /
+                              sizeof (struct iovec))
+                        break;
+        }
+
+        conn->ksnc_rx_niov = nfrags;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = total;
+        return (0);
+}
+
+/*
+ * Service the first connection queued on sched->kss_rx_conns, driving its
+ * receive state machine as far as the available data allows.
+ *
+ * Entered with sched->kss_lock held and the saved irq state in
+ * *irq_flags.  The lock is dropped while the connection is being worked
+ * on and is held again when this function returns.
+ *
+ * States seen here:
+ *   SOCKNAL_RX_GET_FMB  - needs a forwarding buffer before reading
+ *   SOCKNAL_RX_HEADER   - reading a packet header
+ *   SOCKNAL_RX_BODY     - reading the payload of a packet for this node
+ *   SOCKNAL_RX_BODY_FWD - reading the payload of a packet being forwarded
+ *   SOCKNAL_RX_SLOP     - discarding unwanted bytes
+ */
+void
+ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_fmb_t  *fmb;
+        int           rc;
+
+        /* NB: sched->kss_lock held on entry */
+
+        LASSERT (!list_empty (&sched->kss_rx_conns));
+        conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list);
+        list_del (&conn->ksnc_rx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        CDEBUG(D_NET, "sched %p conn %p\n", sched, conn);
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* doesn't need a forwarding buffer */
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
+                goto try_read;
+
+ get_fmb:
+        fmb = ksocknal_get_idle_fmb (conn);
+        if (fmb == NULL) {      /* conn descheduled waiting for idle fmb */
+                /* conn now sits on the pool's blocked list; just retake
+                 * the lock for the caller */
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+                return;
+        }
+
+        if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to read */
+
+        rc = ksocknal_recvmsg(conn);
+
+        /* rc == 0 and rc < 0 both leave via 'out' with ksnc_rx_ready as
+         * cleared above (unless it was set again meanwhile) */
+        if (rc == 0)
+                goto out;
+        if (rc < 0) {
+#warning FIXME: handle socket errors properly
+                CERROR ("Error socknal read %p: %d\n", conn, rc);
+                goto out;
+        }
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        /* got all I wanted, assume there's more - prevent data_ready locking */
+        conn->ksnc_rx_ready = 1;
+
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_HEADER:
+                /* It's not for me */
+                if (conn->ksnc_hdr.type != PTL_MSG_HELLO &&
+                    NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) {
+                        /* fwd_parse picks the next state: drop or forward */
+                        ksocknal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state) {
+                        case SOCKNAL_RX_HEADER: /* skipped (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                LBUG ();
+                        }
+                        /* Not Reached */
+                }
+
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc */
+                lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ksocknal_packets_received);
+                /* packet is done now */
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                /* starting new packet? */
+                if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                        goto out;       /* come back later */
+                goto try_read;          /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        NTOH__u64 (conn->ksnc_hdr.dest_nid),
+                        conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ksocknal_packets_received);
+
+                /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */
+                kpr_fwd_start (&ksocknal_data.ksnd_router,
+                               (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                /* no slop in forwarded packets */
+                LASSERT (conn->ksnc_rx_nob_left == 0);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        /* retake the scheduler lock before touching its lists */
+        spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+        /* no data there to read? */
+        if (!conn->ksnc_rx_ready) {
+                /* let socket callback schedule again */
+                conn->ksnc_rx_scheduled = 0;
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                              /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+}
+
+/*
+ * Receive callback for payloads landing in mapped fragments.
+ *
+ * 'private' is the connection the packet is arriving on.  Nothing is read
+ * here: the conn is told how many bytes to deliver (mlen), how many are
+ * on the wire in total (rlen) and where they go (a copy of 'iov'); the
+ * message is remembered in ksnc_cookie.  Returns rlen.
+ */
+int
+ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
+               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *rx_conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        /* message to finalize once the body is in */
+        rx_conn->ksnc_cookie = msg;
+
+        /* bytes to deliver / bytes on the wire */
+        rx_conn->ksnc_rx_nob_wanted = mlen;
+        rx_conn->ksnc_rx_nob_left   = rlen;
+
+        /* mapped frags only: no page frags */
+        rx_conn->ksnc_rx_nkiov = 0;
+        rx_conn->ksnc_rx_kiov = NULL;
+
+        /* private copy of the caller's iov in the conn's own space */
+        rx_conn->ksnc_rx_niov = niov;
+        rx_conn->ksnc_rx_iov = rx_conn->ksnc_rx_iov_space.iov;
+        memcpy (rx_conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+
+        /* the frags must add up to exactly what is wanted */
+        LASSERT (mlen ==
+                 lib_iov_nob (rx_conn->ksnc_rx_niov, rx_conn->ksnc_rx_iov) +
+                 lib_kiov_nob (rx_conn->ksnc_rx_nkiov, rx_conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+/*
+ * Receive callback for payloads landing in page fragments.
+ *
+ * Page-based twin of ksocknal_recv(): 'private' is the connection, and
+ * the conn is told how many bytes to deliver (mlen), how many are on the
+ * wire in total (rlen) and where they go (a copy of 'kiov'); the message
+ * is remembered in ksnc_cookie.  Returns rlen.
+ */
+int
+ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *rx_conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        /* message to finalize once the body is in */
+        rx_conn->ksnc_cookie = msg;
+
+        /* bytes to deliver / bytes on the wire */
+        rx_conn->ksnc_rx_nob_wanted = mlen;
+        rx_conn->ksnc_rx_nob_left   = rlen;
+
+        /* page frags only: no mapped frags */
+        rx_conn->ksnc_rx_niov = 0;
+        rx_conn->ksnc_rx_iov  = NULL;
+
+        /* private copy of the caller's kiov in the conn's own space */
+        rx_conn->ksnc_rx_nkiov = niov;
+        rx_conn->ksnc_rx_kiov = rx_conn->ksnc_rx_iov_space.kiov;
+        memcpy (rx_conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+
+        /* the frags must add up to exactly what is wanted */
+        LASSERT (mlen ==
+                 lib_iov_nob (rx_conn->ksnc_rx_niov, rx_conn->ksnc_rx_iov) +
+                 lib_kiov_nob (rx_conn->ksnc_rx_nkiov, rx_conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+/*
+ * Scheduler thread body; 'arg' is this thread's ksock_sched_t.
+ *
+ * Until ksocknal_data.ksnd_shuttingdown is set, each pass services at
+ * most one connection with pending receive work, one with pending
+ * transmit work and (with SOCKNAL_ZC) one completed zero-copy tx.  When
+ * a pass finds nothing to do the thread sleeps on kss_waitq; after
+ * SOCKNAL_RESCHED consecutive busy passes it yields the CPU instead.
+ * Returns 0 when shutting down.
+ */
+int ksocknal_scheduler (void *arg)
+{
+        ksock_sched_t     *sched = (ksock_sched_t *)arg;
+        unsigned long      flags;
+        int                rc;
+        int                nloops = 0;
+        /* index of this scheduler in the ksnd_schedulers array */
+        int                id = sched - ksocknal_data.ksnd_schedulers;
+        char               name[16];
+#if (CONFIG_SMP && CPU_AFFINITY)
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        int                cpu = cpu_logical_map(id % num_online_cpus());
+#else
+#warning "Take care of architecure specific logical APIC map"
+        int cpu = 1;    /* Have to change later. */
+#endif /* LINUX_VERSION_CODE */
+        
+        /* bind this thread to its CPU and name it after that CPU */
+        set_cpus_allowed (current, 1 << cpu);
+        id = cpu;
+#endif /* CONFIG_SMP && CPU_AFFINITY */
+
+        snprintf (name, sizeof (name),"ksocknald[%d]", id);
+        kportal_daemonize (name);
+        kportal_blockallsigs ();
+        
+        /* kss_lock is held at the top of every loop iteration */
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&sched->kss_rx_conns)) {
+                        did_something = 1;
+                        /* drops & regains kss_lock */
+                        ksocknal_process_receive (sched, &flags);
+                }
+
+                if (!list_empty (&sched->kss_tx_conns)) {
+                        did_something = 1;
+                        /* drops and regains kss_lock */
+                        ksocknal_process_transmit (sched, &flags);
+                }
+#if SOCKNAL_ZC
+                /* finish one tx queued by ksocknal_zc_callback() */
+                if (!list_empty (&sched->kss_zctxdone_list)) {
+                        ksock_tx_t *tx =
+                                list_entry(sched->kss_zctxdone_list.next,
+                                           ksock_tx_t, tx_list);
+                        did_something = 1;
+
+                        list_del (&tx->tx_list);
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        ksocknal_tx_done (tx);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+#endif
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                        /* must not sleep or yield with the lock held */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+#if SOCKNAL_ZC
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns) ||
+                                                               !list_empty(&sched->kss_zctxdone_list));
+#else
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns));
+#endif
+                                /* the wait is not expected to be interrupted */
+                                LASSERT (rc == 0);
+                        } else
+                               our_cond_resched();
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        ksocknal_thread_fini ();
+        return (0);
+}
+/* Socket 'data_ready' callback: marks the connection attached to sk as having
+ * data to read and, unless it is already queued, puts it on its scheduler's
+ * receive list and wakes the scheduler.
+ *
+ * sk  socket reporting data; sk->user_data holds its ksock_conn_t, or NULL
+ *     when this call raced with ksocknal_close_sock.
+ * n   handed unchanged to the socket's current data_ready handler in the
+ *     NULL case.
+ *
+ * Runs under a read lock on ksnd_socklist_lock; the scheduler's kss_lock is
+ * taken inside it.
+ */
+void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->data_ready != &ksocknal_data_ready); /* handler must already have been replaced, else the next line would recurse */
+                sk->data_ready (sk, n);
+        } else if (!conn->ksnc_rx_ready) {        /* new news */
+                /* Set ASAP in case of concurrent calls to me */
+                conn->ksnc_rx_ready = 1;
+
+                sched = conn->ksnc_scheduler;
+
+                spin_lock_irqsave (&sched->kss_lock, flags);
+
+                /* Set again (process_receive may have cleared while I blocked for the lock) */
+                conn->ksnc_rx_ready = 1;
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&sched->kss_waitq)) /* only wake if a scheduler is actually sleeping */
+                                wake_up (&sched->kss_waitq);
+                }
+
+                spin_unlock_irqrestore (&sched->kss_lock, flags);
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        EXIT;
+}
+/* Socket 'write_space' callback: once the socket has at least
+ * SOCKNAL_TX_LOW_WATER(sk) bytes of send space, marks the attached connection
+ * ready to transmit and, if it has queued packets and is not already being
+ * progressed, puts it on its scheduler's transmit list and wakes the
+ * scheduler.
+ *
+ * sk  socket reporting space; sk->user_data holds its ksock_conn_t, or NULL
+ *     when this call raced with ksocknal_close_sock, in which case the
+ *     socket's current write_space handler is called instead.
+ *
+ * Runs under a read lock on ksnd_socklist_lock; the scheduler's kss_lock is
+ * taken inside it.
+ *
+ * NOTE(review): the debug message reads ksnc_tx_ready with test_bit() while
+ * the code below reads and writes it as a plain integer -- confirm the
+ * field's type.
+ */
+void
+ksocknal_write_space (struct sock *sk)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+
+        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
+               (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ?
+                                      " ready" : " blocked"),
+               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                      " scheduled" : " idle"),
+               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                      " empty" : " queued"));
+
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->write_space != &ksocknal_write_space); /* handler must already have been replaced, else the next line would recurse */
+                sk->write_space (sk);
+        } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+                clear_bit (SOCK_NOSPACE, &sk->socket->flags);
+
+                if (!conn->ksnc_tx_ready) {      /* new news */
+                        /* Set ASAP in case of concurrent calls to me */
+                        conn->ksnc_tx_ready = 1;
+
+                        sched = conn->ksnc_scheduler;
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        /* Set again (process_transmit may have
+                           cleared while I blocked for the lock) */
+                        conn->ksnc_tx_ready = 1;
+
+                        if (!conn->ksnc_tx_scheduled && // not being progressed
+                            !list_empty(&conn->ksnc_tx_queue)){//packets to send
+                                list_add_tail (&conn->ksnc_tx_list,
+                                               &sched->kss_tx_conns);
+                                conn->ksnc_tx_scheduled = 1;
+                                /* extra ref for scheduler */
+                                atomic_inc (&conn->ksnc_refcount);
+
+                                if (waitqueue_active (&sched->kss_waitq)) /* only wake if a scheduler is actually sleeping */
+                                        wake_up (&sched->kss_waitq);
+                        }
+
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+}
+
+/* Reaper thread body: closes connections that were queued on
+ * ksocknal_data.ksnd_reaper_list, one at a time, until shutdown is flagged.
+ *
+ * Each pass detaches at most one connection while holding ksnd_reaper_lock
+ * and closes it with the lock released; with nothing queued the thread
+ * sleeps on ksnd_reaper_waitq.  arg is unused.  Returns 0 after calling
+ * ksocknal_thread_fini().
+ */
+int
+ksocknal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *victim;
+        int                rc;
+
+        kportal_daemonize ("ksocknal_reaper");
+        kportal_blockallsigs ();
+
+        for (;;) {
+                if (ksocknal_data.ksnd_shuttingdown)
+                        break;
+
+                /* pull the first queued connection, if any, off the list */
+                victim = NULL;
+
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_reaper_list)) {
+                        victim = list_entry (ksocknal_data.ksnd_reaper_list.next,
+                                             ksock_conn_t, ksnc_list);
+                        list_del (&victim->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (victim == NULL) {
+                        /* nothing to reap: sleep until work or shutdown */
+                        rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq,
+                                                       ksocknal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ksocknal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                        continue;
+                }
+
+                ksocknal_close_conn (victim);
+        }
+
+        ksocknal_thread_fini ();
+        return (0);
+}
+/* nal_cb_t callback table of the socket NAL: every cb_* slot is bound to the
+ * ksocknal_* routine of the same name and nal_data points at the NAL's
+ * global state, ksocknal_data. */
+nal_cb_t ksocknal_lib = {
+        nal_data:       &ksocknal_data,                /* NAL private data */
+        cb_send:         ksocknal_send,
+        cb_send_pages:   ksocknal_send_pages,
+        cb_recv:         ksocknal_recv,
+        cb_recv_pages:   ksocknal_recv_pages,
+        cb_read:         ksocknal_read,
+        cb_write:        ksocknal_write,
+        cb_callback:     ksocknal_callback,
+        cb_malloc:       ksocknal_malloc,
+        cb_free:         ksocknal_free,
+        cb_printf:       ksocknal_printf,
+        cb_cli:          ksocknal_cli,
+        cb_sti:          ksocknal_sti,
+        cb_dist:         ksocknal_dist
+};
diff --git a/lustre/portals/knals/toenal/Makefile.am b/lustre/portals/knals/toenal/Makefile.am
new file mode 100644 (file)
index 0000000..9bfff64
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ktoenal
+modulenet_DATA = ktoenal.o
+EXTRA_PROGRAMS = ktoenal
+
+DEFS =
+ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h
diff --git a/lustre/portals/knals/toenal/toenal.c b/lustre/portals/knals/toenal/toenal.c
new file mode 100644 (file)
index 0000000..178ea41
--- /dev/null
@@ -0,0 +1,629 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <linux/poll.h>
+#include "toenal.h"
+
+ptl_handle_ni_t         ktoenal_ni;     /* NI handle filled in by PtlNIInit() in ktoenal_module_init(); exported */
+static nal_t            ktoenal_api;    /* API-side hooks, populated in ktoenal_module_init() */
+static ksock_nal_data_t ktoenal_data;   /* all global state of this NAL */
+
+/*
+ * Disabled interface table, left commented out in the source:
+ *
+ksocknal_interface_t ktoenal_interface = {
+        ksni_add_sock:         ktoenal_add_sock,
+        ksni_close_sock:       ktoenal_close_sock,
+        ksni_set_mynid:                ktoenal_set_mynid,
+};
+*/
+/* Descriptor passed to kpr_register() in ktoenal_module_init(): identifies
+ * this NAL as TOENAL and supplies ktoenal_data as the argument and
+ * ktoenal_fwd_packet as the forwarding routine (presumably called by the
+ * router to hand packets to this NAL -- confirm against the router code). */
+kpr_nal_interface_t ktoenal_router_interface = {
+        kprni_nalid:   TOENAL,
+        kprni_arg:     &ktoenal_data,
+        kprni_fwd:     ktoenal_fwd_packet,
+};
+
+
+/* nal_t 'forward' hook: relays an API request to the portals library.
+ *
+ * The NAL state is found through nal->nal_data; its library callback table
+ * together with id, args and ret is passed on to lib_dispatch().  args_len
+ * and ret_len are accepted but not consulted.  Always returns PTL_OK.
+ */
+int
+ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        /* ktoenal_send needs the NAL state, hence 'data' as private arg */
+        lib_dispatch(data->ksnd_nal_cb, data, id, args, ret);
+
+        return PTL_OK;
+}
+
+/* nal_t 'shutdown' hook: closes every connection of this NAL.
+ *
+ * Neither argument is used.  Returns whatever ktoenal_close_sock(0)
+ * returns.
+ */
+int
+ktoenal_api_shutdown(nal_t *nal, int ni)
+{
+        int rc;
+
+        CDEBUG (D_NET, "closing all connections\n");
+
+        /* a zero nid asks ktoenal_close_sock() to close all sockets */
+        rc = ktoenal_close_sock(0);
+
+        return rc;
+}
+
+/* nal_t 'yield' hook: gives other tasks a chance to run via
+ * our_cond_resched().  nal is unused. */
+void
+ktoenal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+}
+
+/* nal_t 'lock' hook: invokes the cb_cli callback of the library callback
+ * table reached through nal->nal_data, passing flags through. */
+void
+ktoenal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *lib  = data->ksnd_nal_cb;
+
+        lib->cb_cli(lib, flags);
+}
+
+/* nal_t 'unlock' hook: invokes the cb_sti callback of the library callback
+ * table reached through nal->nal_data, passing flags through. */
+void
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+        nal_cb_t         *lib  = data->ksnd_nal_cb;
+
+        lib->cb_sti(lib, flags);
+}
+
+/* NAL initialiser handed to PtlNIInit(): calls lib_init() on ktoenal_lib
+ * with the currently recorded nid and the requested table sizes, then
+ * returns the API-side nal_t.
+ *
+ * interface and requested_pid are not used, and the result of lib_init()
+ * is not examined.
+ */
+nal_t *
+ktoenal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
+               ktoenal_data.ksnd_mynid);
+
+        lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10,
+                 ptl_size, ac_size);
+
+        return &ktoenal_api;
+}
+
+/*
+ *  EXTRA functions follow
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+/* kernels before 2.5: define SOCKET_I() as the socket member of the inode */
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+
+/* Return the struct socket that SOCKET_I() yields for inode.  No check is
+ * made here that the inode really belongs to a socket. */
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        struct socket *sock = SOCKET_I(inode);
+
+        return sock;
+}
+
+/* Record nid as this node's network id, both in the NAL state and in the
+ * library's network-interface structure.  Always returns 0.
+ */
+int
+ktoenal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *lib_ni = &ktoenal_lib.ni;
+
+        /* FIXME: lib_init() runs at module insertion time, before 'mynid'
+         * is known, and it is lib_init() that sets the NAL's nid (used to
+         * tell other nodes where packets come from).  Patching the nid in
+         * afterwards, as done here, is not a very graceful solution. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, lib_ni->nid);
+
+        lib_ni->nid = nid;
+        ktoenal_data.ksnd_mynid = nid;
+
+        return (0);
+}
+
+/* Register the socket behind file descriptor fd (of the calling process) as
+ * the connection to peer nid.
+ *
+ * A connection descriptor is allocated and initialised, linked onto
+ * ksnd_socklist with one reference held for the list, kicked once through
+ * ktoenal_data_ready()/ktoenal_write_space(), and the poll thread is woken
+ * so that it starts polling the new socket.  The file reference taken by
+ * fget() is kept in the descriptor on success and dropped on failure.
+ *
+ * Returns 0 on success, -EINVAL if fd is not an open socket, or -ENOMEM if
+ * the descriptor cannot be allocated.
+ *
+ * NOTE(review): the descriptor is still used after ksnd_socklist_lock has
+ * been released, with only the list's reference held -- confirm that a
+ * concurrent ktoenal_close_sock() cannot free it in that window.
+ */
+int
+ktoenal_add_sock (ptl_nid_t nid, int fd)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        int                ret;
+        ENTRY;
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        /* socki_lookup() reinterprets the inode unconditionally, so reject
+         * descriptors that do not refer to a socket before calling it */
+        if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
+                GOTO(error, ret);
+
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        /* zero the whole descriptor (sizeof (conn) would only cover a
+         * pointer's worth of it) */
+        memset (conn, 0, sizeof (*conn));
+        file->f_flags |= O_NONBLOCK;  /*  Does this have any conflicts */
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ktoenal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist);
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        ktoenal_data_ready(conn);
+        ktoenal_write_space(conn);
+
+        /* Schedule pollthread so that it will poll
+         * for newly created socket
+         */
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);             /* drop the reference fget() took */
+        RETURN(ret);
+}
+
+/* Close the connection to peer nid.
+ * Passing in a zero nid will close all connections.
+ *
+ * The affected connections are moved from ksnd_socklist onto a private list
+ * while the socklist write lock is held; the list's reference on each is
+ * then dropped with the lock released, and the poll thread is woken to
+ * notice the change.
+ *
+ * Returns 0 if at least one connection was removed, -ENOENT otherwise.
+ */
+int
+ktoenal_close_sock(ptl_nid_t nid)
+{
+        unsigned long      flags;       /* saved irq state: must be unsigned long */
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ktoenal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ktoenal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else {
+                list_for_each (tmp, &ktoenal_data.ksnd_socklist) {
+
+                        conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                        if (conn->ksnc_peernid == nid) {
+                                /* unlinking while iterating is safe only
+                                 * because the walk stops right here */
+                                list_del (&conn->ksnc_list);
+                                list_add (&conn->ksnc_list, &death_row);
+                                break;
+                        }
+                }
+        }
+
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (list_empty (&death_row))
+                return (-ENOENT);
+
+        do {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+                ktoenal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        } while (!list_empty (&death_row));
+
+        /* tell the poll thread that the socket list has changed */
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        return (0);
+}
+
+
+/* Find the connection to peer nid on ksnd_socklist.
+ *
+ * The list is walked under the socklist read lock.  On a match the
+ * connection's refcount is incremented on behalf of the caller before the
+ * lock is released.  Returns the connection, or NULL if none matches.
+ */
+ksock_conn_t *
+ktoenal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *pos;
+        ksock_conn_t     *found = NULL;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ktoenal_data.ksnd_socklist_lock);
+
+        list_for_each(pos, &ktoenal_data.ksnd_socklist) {
+                ksock_conn_t *candidate = list_entry(pos, ksock_conn_t,
+                                                     ksnc_list);
+
+                if (candidate->ksnc_peernid != nid)
+                        continue;
+
+                /* caller is referencing */
+                atomic_inc (&candidate->ksnc_refcount);
+                found = candidate;
+                break;
+        }
+
+        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+        if (found != NULL) {
+                CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                       found, nid, atomic_read (&found->ksnc_refcount));
+        } else {
+                CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid);
+        }
+
+        PROF_FINISH(conn_list_walk);
+        return (found);
+}
+
+/* Final teardown of a connection: drops the file reference held in the
+ * descriptor, frees the descriptor and releases the module use count that
+ * was taken when the connection was registered.
+ */
+void
+ktoenal_close_conn (ksock_conn_t *conn)
+{
+        struct file *sockfile = conn->ksnc_file;
+
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (sockfile);
+        PORTAL_FREE (conn, sizeof (*conn));
+
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+/* Dispose of a connection whose refcount has reached zero.
+ *
+ * Outside interrupt context the connection is closed on the spot.  In
+ * interrupt context it is queued on ksnd_reaper_list and the reaper is
+ * woken to close it later.
+ */
+void
+_ktoenal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (in_interrupt()) {
+                /* defer the close to the reaper thread */
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list);
+                wake_up (&ktoenal_data.ksnd_reaper_waitq);
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+                return;
+        }
+
+        ktoenal_close_conn (conn);
+}
+
+/* Release the preallocated forwarding buffers (including every page that
+ * was attached to them) and the transmit descriptor array.  Arrays that
+ * were never allocated (NULL) are skipped.
+ */
+void
+ktoenal_free_buffers (void)
+{
+        const int nfmbs = SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS;
+        const int nltxs = SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS;
+
+        if (ktoenal_data.ksnd_fmbs != NULL) {
+                ksock_fmb_t *fmbs = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs;
+                int          msg;
+                int          pg;
+
+                for (msg = 0; msg < nfmbs; msg++) {
+                        ksock_fmb_t *fmb = &fmbs[msg];
+
+                        for (pg = 0; pg < fmb->fmb_npages; pg++) {
+                                /* slots past a failed allocation are NULL */
+                                if (fmb->fmb_pages[pg] != NULL)
+                                        __free_page (fmb->fmb_pages[pg]);
+                        }
+                }
+
+                PORTAL_FREE (ktoenal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * nfmbs);
+        }
+
+        if (ktoenal_data.ksnd_ltxs != NULL) {
+                PORTAL_FREE (ktoenal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * nltxs);
+        }
+}
+
+/* NAL command handler registered with kportal_nal_register(): dispatches on
+ * data->ioc_nal_cmd to register a peer socket, close a connection or set
+ * the local nid.
+ *
+ * Returns the result of the selected operation, or -EINVAL for a command
+ * that is not handled here.  private is unused.
+ */
+int
+ktoenal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        LASSERT (data != NULL);
+
+        switch (data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD:
+                return ktoenal_add_sock(data->ioc_nid, data->ioc_fd);
+
+        case NAL_CMD_CLOSE_CONNECTION:
+                return ktoenal_close_sock(data->ioc_nid);
+
+        case NAL_CMD_REGISTER_MYNID:
+                return ktoenal_set_mynid (data->ioc_nid);
+
+        default:
+                return -EINVAL;
+        }
+}
+
+
+/* Module exit, also called by ktoenal_module_init() to unwind a failed
+ * start.
+ *
+ * ktoenal_data.ksnd_init says how far initialisation got; the switch enters
+ * at that stage and falls through all earlier ones, undoing them in reverse
+ * order: unregister the command interface and exported symbol, shut down
+ * the portals NI and library, then stop the threads, deregister from the
+ * router and free the buffers.
+ */
+void __exit
+ktoenal_module_fini (void)
+{
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ktoenal_data.ksnd_init)
+        {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(TOENAL);
+                PORTAL_SYMBOL_UNREGISTER (ktoenal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ktoenal_ni);
+                lib_fini(&ktoenal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ktoenal_data.ksnd_socklist));
+                LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ktoenal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ktoenal_data.ksnd_reaper_waitq);
+                wake_up_all (&ktoenal_data.ksnd_sched_waitq);
+                /* the task pointer is still NULL when initialisation failed
+                 * before the poll thread was started */
+                if (ktoenal_data.ksnd_pollthread_tsk != NULL)
+                        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+                while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0)
+                {
+                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ktoenal_data.ksnd_nthreads));
+                        /* re-check once a second */
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ktoenal_data.ksnd_router);
+
+                ktoenal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing TOE NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+/* Module entry point: initialises the TOE NAL's global state, preallocates
+ * the forwarding buffers and transmit descriptors, brings up the portals
+ * network interface, starts the scheduler, reaper and poll threads and
+ * registers with the router and the NAL command dispatcher.
+ *
+ * ktoenal_data.ksnd_init records how far initialisation got, so that
+ * ktoenal_module_fini(), which most failure paths call, undoes only the
+ * stages that completed.  Failure to register with the router or the
+ * command dispatcher is only logged.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * PtlNIInit() / ktoenal_thread_start().
+ *
+ * NOTE(review): RETURN() is used without a matching ENTRY, and the first
+ * allocation failure returns without calling ktoenal_module_fini().
+ */
+int __init
+ktoenal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);    /* memory in use before we allocate anything */
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ktoenal_api.forward  = ktoenal_api_forward;
+        ktoenal_api.shutdown = ktoenal_api_shutdown;
+        ktoenal_api.yield    = ktoenal_api_yield;
+        ktoenal_api.validate = NULL;           /* our api validate is a NOOP */
+        ktoenal_api.lock     = ktoenal_api_lock;
+        ktoenal_api.unlock   = ktoenal_api_unlock;
+        ktoenal_api.nal_data = &ktoenal_data;
+
+        ktoenal_lib.nal_data = &ktoenal_data;
+
+        memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist);
+        rwlock_init(&ktoenal_data.ksnd_socklist_lock);
+
+        ktoenal_data.ksnd_nal_cb = &ktoenal_lib;
+        spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init (&ktoenal_data.ksnd_sched_lock);
+
+        init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns);
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq);
+        spin_lock_init (&ktoenal_data.ksnd_reaper_lock);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_fmbs,
+                     sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        if (ktoenal_data.ksnd_fmbs == NULL)
+                RETURN(-ENOMEM);        /* nothing allocated yet; ktoenal_module_fini() is not called here */
+
+        /* NULL out buffer pointers etc */
+        memset(ktoenal_data.ksnd_fmbs, 0,
+               sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+
+        for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++)
+        {
+                ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i];
+
+                if (i < SOCKNAL_SMALL_FWD_NMSGS)        /* the small buffers come first in the array */
+                {
+                        fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp;
+                }
+                else
+                {
+                        fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp;
+                }
+
+                LASSERT (fmb->fmb_npages > 0);
+                for (j = 0; j < fmb->fmb_npages; j++)
+                {
+                        fmb->fmb_pages[j] = alloc_page (GFP_KERNEL);
+
+                        if (fmb->fmb_pages[j] == NULL)
+                        {
+                                ktoenal_module_fini ();  /* frees the pages obtained so far */
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT (page_address (fmb->fmb_pages[j]) != NULL);
+                }
+
+                list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+        }
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_ltxs,
+                     sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+        if (ktoenal_data.ksnd_ltxs == NULL)
+        {
+                ktoenal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ktoenal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++)
+        {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i];
+
+                /* first SOCKNAL_NLTXS descriptors go on the normal idle
+                 * list, the remainder on the non-blocking idle list */
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ktoenal_data.ksnd_idle_ltx_list :
+                                &ktoenal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni);
+        if (rc != 0)
+        {
+                CERROR("ktoenal: PtlNIInit failed: error %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ktoenal_ni, ~0);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */
+
+        ktoenal_data.ksnd_slistchange = 1;
+        for (i = 0; i < TOENAL_N_SCHED; i++)
+        {
+                rc = ktoenal_thread_start (ktoenal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc);
+                        ktoenal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ktoenal_thread_start (ktoenal_reaper, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = ktoenal_thread_start (ktoenal_pollthread, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal pollthread: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ktoenal_data.ksnd_router,
+                  &ktoenal_router_interface);
+        if (rc != 0)    /* not fatal: carry on without routing */
+                CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+        rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL);
+        if (rc != 0)    /* not fatal either: only logged */
+                CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n",
+                       rc);
+
+        PORTAL_SYMBOL_REGISTER(ktoenal_ni);
+
+        /* flag everything initialised */
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+       printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+              kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+/* Module metadata, load/unload hooks and the exported NI handle.
+ * NOTE(review): the description still reads "TCP Socket NAL" although this
+ * module is the TOE NAL -- presumably inherited from socknal; confirm the
+ * intended wording. */
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ktoenal_module_init);
+module_exit(ktoenal_module_fini);
+
+EXPORT_SYMBOL (ktoenal_ni);     /* lets other modules reach the NI handle set up in ktoenal_module_init() */
diff --git a/lustre/portals/knals/toenal/toenal.h b/lustre/portals/knals/toenal/toenal.h
new file mode 100644 (file)
index 0000000..f793d3b
--- /dev/null
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/sched.h> 
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+/* Compile-time sizing constants for the TOE NAL: descriptor pool sizes,
+ * forwarding buffer geometry and scheduler tuning. */
+#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)        /* biggest payload I can forward */
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 32              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+/* enough whole pages to hold a header plus the biggest forwardable payload */
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+/* 8/10 (80%) of the 'sndbuf' member of 'sk' */
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
+
+/* presumably the number of scheduler threads spawned at init -- TODO confirm */
+#define TOENAL_N_SCHED 1
+
+/* One pool of forwarding message buffers (FMBs): the idle buffers, plus the
+ * connections queued (via ksock_conn_t::ksnc_rx_list) until one frees up. */
+typedef struct                                  /* pool of forwarding buffers */
+{
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+/* All global state of the TOE NAL; the single instance is 'ktoenal_data'. */
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+
+        ptl_nid_t         ksnd_mynid;           /* presumably this node's NID (cf. ktoenal_set_mynid) -- TODO confirm */
+        nal_cb_t         *ksnd_nal_cb;          /* presumably the lib callback table (ktoenal_lib) -- TODO confirm */
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        spinlock_t        ksnd_sched_lock;      /* serialise packet scheduling */
+        wait_queue_head_t ksnd_sched_waitq;     /* where scheduler(s) wait */
+
+        struct list_head  ksnd_rx_conns;        /* conn waiting to be read */
+        struct list_head  ksnd_tx_conns;        /* conn waiting to be written */
+        
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        
+        struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */
+        poll_table          ksnd_pwait;         /* poll wait table for the socket */
+        int                 ksnd_slistchange;   /* informs the pollthread that
+                                                 * the socklist has changed */  
+} ksock_nal_data_t;
+
+/* Values stored in ksock_nal_data_t::ksnd_init; module init sets
+ * SOCKNAL_INIT_ALL once everything has been set up. */
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+/* Transmit descriptor common to both kinds of outgoing packet: it is embedded
+ * in ksock_ltx_t for local sends and overlaid on
+ * kpr_fwd_desc_t::kprfd_scratch for forwarded packets. */
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;       /* queue on conn for transmission etc */
+        char                    tx_isfwd;      /* forwarding / sourced here */
+        int                     tx_nob;        /* # packet bytes */
+        int                     tx_niov;       /* # packet frags */
+        struct iovec           *tx_iov;        /* packet frags */
+} ksock_tx_t;
+
+/* Descriptor for a packet sent from this node.  ltx_iov[0] is pointed at
+ * ltx_hdr by ktoenal_send(); the remaining slots carry the payload frags. */
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        struct iovec            ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+/* Map a ksock_tx_t pointer back to the structure that contains it. */
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as a convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
+
+/* Forwarding message buffer: pages that receive a packet to be routed on,
+ * plus the router descriptor used to hand it over.  Both arrays are sized
+ * for the large pool; fmb_npages records how many pages are really there. */
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+/* Receive state machine: values of ksock_conn_t::ksnc_rx_state */
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+/* Per-connection state: the socket, the peer it leads to, a reference count
+ * (dropped with ktoenal_put_conn()), and separate reader / writer scheduling
+ * state. */
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* socket */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        unsigned long       ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # frags */
+        struct iovec        ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */
+
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        unsigned long       ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+        
+} ksock_conn_t;
+
+/* Socket / connection management entry points (declarations only; the
+ * definitions are not part of this header). */
+extern int ktoenal_add_sock (ptl_nid_t nid, int fd);
+extern int ktoenal_close_sock(ptl_nid_t nid);
+extern int ktoenal_set_mynid(ptl_nid_t nid);
+extern int ktoenal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid);
+/* called by ktoenal_put_conn() when the last reference has been dropped */
+extern void _ktoenal_put_conn (ksock_conn_t *conn);
+extern void ktoenal_close_conn (ksock_conn_t *conn);
+
+/* Drop one reference on 'conn'.  The current count is traced first; when the
+ * decrement takes the count to zero the conn is handed to
+ * _ktoenal_put_conn(). */
+static inline void
+ktoenal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", 
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        if (!atomic_dec_and_test (&conn->ksnc_refcount))
+                return;                         /* other users remain */
+
+        _ktoenal_put_conn (conn);
+}
+
+/* Thread creation, the thread bodies handed to it, and packet handling hooks */
+extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ktoenal_new_packet (ksock_conn_t *conn, int skip);
+extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ktoenal_scheduler (void *arg);
+extern int ktoenal_reaper (void *arg);
+extern int ktoenal_pollthread (void *arg);
+extern void ktoenal_data_ready(ksock_conn_t *conn);
+extern void ktoenal_write_space(ksock_conn_t *conn);
+
+
+/* global instances defined in the .c files */
+extern nal_cb_t         ktoenal_lib;
+extern ksock_nal_data_t ktoenal_data;
diff --git a/lustre/portals/knals/toenal/toenal_cb.c b/lustre/portals/knals/toenal/toenal_cb.c
new file mode 100644 (file)
index 0000000..8270196
--- /dev/null
@@ -0,0 +1,1220 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *   
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/poll.h>
+#include "toenal.h"
+
+/* Packet statistics */
+atomic_t   ktoenal_packets_received;    /* bumped atomically as packets arrive */
+long       ktoenal_packets_launched;    /* bumped in ktoenal_launch_packet() under ksnd_sched_lock */
+long       ktoenal_packets_transmitted; /* bumped in ktoenal_process_transmit() once a packet has gone */
+
+/*
+ *  LIB functions follow
+ *
+ */
+/* Copy 'len' bytes from 'src_addr' to 'dst_addr' with a plain memcpy(),
+ * tracing the transfer at D_NET.  'private' is not used.  Always returns 0. */
+int ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr,
+                 user_ptr src_addr, size_t len)
+{
+        long nbytes = (long)len;                /* for the %ld below */
+
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, nbytes, src_addr, dst_addr);
+        memcpy(dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/* Mirror image of ktoenal_read(): copy 'len' bytes from 'src_addr' to
+ * 'dst_addr' with memcpy(), tracing at D_NET.  'private' is not used.
+ * Always returns 0. */
+int ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+                  void *src_addr, size_t len)
+{
+        long nbytes = (long)len;                /* for the %ld below */
+
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, nbytes, src_addr, dst_addr);
+        memcpy(dst_addr, src_addr, len);
+
+        return (0);
+}
+
+/* Deliver event 'ev' to the handler installed on event queue 'eq', if there
+ * is one.  'private' is not used.  Always returns 0. */
+int
+ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                  ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback == NULL)         /* nobody listening */
+                return (0);
+
+        eq->event_callback(ev);
+        return (0);
+}
+
+/* Allocate 'len' bytes with PORTAL_ALLOC and return them zero-filled, or
+ * NULL if the allocation failed.  'nal' is not used. */
+void *
+ktoenal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        if (buf == NULL)
+                return (NULL);
+
+        memset(buf, 0, len);
+        return (buf);
+}
+
+/* Release 'buf' via PORTAL_FREE; 'len' is passed through to it.
+ * 'nal' is not used. */
+void ktoenal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+        return;
+}
+
+/* printf-style logging hook: format into a 256 byte stack buffer (longer
+ * output is truncated) and emit it at D_NET.  'nal' is not used. */
+void ktoenal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        char    text[256];
+        va_list args;
+
+        va_start (args, fmt);
+        vsnprintf (text, sizeof (text), fmt, args);     /* bounded format */
+        va_end (args);
+
+        text[sizeof (text) - 1] = '\0';                 /* force termination */
+
+        CDEBUG (D_NET, "%s", text);
+}
+
+/* Enter the lib critical section by taking ksnd_nal_cb_lock.
+ * NB 'flags' is not touched: this is a plain spin_lock(), no irq state is
+ * saved here. */
+void ktoenal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+
+        data = nal->nal_data;
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+/* Leave the lib critical section entered by ktoenal_cli() by releasing
+ * ksnd_nal_cb_lock.  'flags' is not used. */
+void ktoenal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+/* Report the "distance" to 'nid' in *dist: 0 if it is this NAL's own nid,
+ * 1 for anything else.  Always returns 0. */
+int ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ktoenal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        *dist = (nal->ni.nid == nid) ? 0 : 1;
+
+        return (0);
+}
+
+/* Take a local transmit descriptor (LTX) off the idle lists.
+ *
+ * may_block != 0: only the normal idle list is used; if it is empty the
+ *                 caller sleeps on ksnd_idle_ltx_waitq until it is not, then
+ *                 retries.  Never returns NULL.
+ * may_block == 0: the normal list is tried first, then the list reserved for
+ *                 non-blocking callers; returns NULL if both are empty.
+ *
+ * The lists are protected by ksnd_sched_lock, which is held on every exit
+ * from the loop and released just before returning.
+ */
+ksock_ltx_t *
+ktoenal_get_ltx (int may_block)
+{
+        unsigned long flags;            /* spin_lock_irqsave() wants unsigned long */
+        ksock_ltx_t  *ltx = NULL;
+
+        for (;;)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list))
+                {
+                        ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block)
+                {
+                        /* fall back on the reserved descriptors */
+                        if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list))
+                        {
+                                ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next,
+                                                  ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                /* must not sleep holding the lock */
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                wait_event (ktoenal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ktoenal_data.ksnd_idle_ltx_list));
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+        return (ltx);
+}
+
+int
+ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags)
+{
+        /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't)
+         */
+        mm_segment_t oldmm;
+        int           rc;
+
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        oldmm = get_fs();
+        set_fs (KERNEL_DS);
+
+#ifdef PORTAL_DEBUG
+        {
+                int total_nob;
+                int i;
+                
+                for (i = total_nob = 0; i < niov; i++)
+                        total_nob += iov[i].iov_len;
+                
+                LASSERT (nob == total_nob);
+        }
+#endif        
+        LASSERT (!in_interrupt());
+       
+        rc = sock->f_op->writev(sock, iov, niov, NULL);
+
+        set_fs (oldmm);
+
+        if (rc > 0)                             /* sent something? */
+        {
+                nob = rc;                       /* consume iov */
+                for (;;)
+                {
+                        LASSERT (niov > 0);
+                        
+                        if (iov->iov_len >= nob)
+                        {
+                                iov->iov_len -= nob;
+                                iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob);
+                                break;
+                        }
+                        nob -= iov->iov_len;
+                        iov->iov_len = 0;
+                        iov++;
+                        niov--;
+                }
+        }
+
+        return (rc);
+}
+
+/* Read up to 'toread' bytes from 'sock' into the fragments of 'iov' using the
+ * file's readv operation, with the address limit switched to KERNEL_DS for
+ * the duration of the call.
+ *
+ * If the fragments add up to 'toread' bytes or more, only the leading
+ * fragments needed are passed to readv and the last of them is temporarily
+ * shortened so the total is exactly 'toread'; its original length is put back
+ * afterwards.  Otherwise all 'niov' fragments are passed.
+ *
+ * Returns the value returned by readv.
+ */
+int
+ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread)
+{
+        /* NB This procedure "consumes" iov (actually tcp_recvmsg does)
+         */
+        mm_segment_t oldmm;
+        int ret, i, len = 0, origlen = 0;
+        
+        PROF_START(our_recvmsg);
+        /* find the fragment in which the running total reaches 'toread' */
+        for(i = 0; i < niov; i++) {
+                len += iov[i].iov_len;
+                if(len >= toread)
+                        break;
+        }
+
+        if(len >= toread) {
+                /* trim the excess off fragment i, remembering its real length */
+                origlen = iov[i].iov_len;
+                iov[i].iov_len -= (len - toread);
+        }
+        else {  /* i == niov */
+                i = niov - 1;
+        }
+
+        oldmm = get_fs();
+        set_fs(KERNEL_DS);
+
+        /* i is the index of the last fragment to use => i + 1 fragments */
+        ret = sock->f_op->readv(sock, iov, i + 1, NULL);
+        
+        set_fs(oldmm);
+
+        /* NB origlen == 0 doubles as "nothing was trimmed" */
+        if(origlen)
+                iov[i].iov_len = origlen;
+
+        PROF_FINISH(our_recvmsg);
+        return ret;
+}
+
+/* Try to send the packet at the head of conn's tx queue.
+ *
+ * Entered with ksnd_sched_lock held (irq state in *irq_flags).  The lock is
+ * dropped around the actual send and completion callbacks, and is held again
+ * on return.
+ *
+ * - Whole packet sent (or a send error other than -EAGAIN, which is currently
+ *   treated as if the whole packet went): the packet's conn ref is dropped and
+ *   the packet is completed via kpr_fwd_done() (forwarded) or lib_finalize()
+ *   (local; the ltx then goes back on its idle list).
+ * - Partial / nothing sent: tx_nob is reduced by the bytes that went and the
+ *   packet is put back at the head of the tx queue.
+ *
+ * Finally the conn is either descheduled (dropping the scheduler's ref) or
+ * put back on ksnd_tx_conns for another pass.
+ *
+ * NOTE(review): the MSG_* flags computed below are handed to
+ * ktoenal_sendmsg(), which does not reference its 'flags' argument.
+ */
+void
+ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        int         rc;
+        
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+
+        /* assume transmit will complete now, so dequeue while I've got the lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;                /* write_space may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to write */
+
+        rc = ktoenal_sendmsg (conn->ksnc_file,
+                               tx->tx_iov, tx->tx_niov, tx->tx_nob,
+                               list_empty (&conn->ksnc_tx_queue) ? 
+                               MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE));
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc < 0)                             /* error */
+        {
+                if (rc == -EAGAIN)              /* socket full => */
+                        rc = 0;                 /* nothing sent */
+                else
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                        rc = tx->tx_nob;        /* kid on for now whole packet went */
+                }
+        }
+
+        if (rc == tx->tx_nob)                   /* everything went */
+        {
+                conn->ksnc_tx_ready = 1;        /* assume more can go (ASAP) */
+                ktoenal_put_conn (conn);       /* release packet's ref */
+
+                if (tx->tx_isfwd)               /* was a forwarded packet? */
+                {
+                        kpr_fwd_done (&ktoenal_data.ksnd_router,
+                                      KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                }
+                else                            /* local send */
+                {
+                        ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+                        lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                        
+                        /* descriptor goes back on whichever idle list it came from */
+                        list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+                        /* normal tx desc => wakeup anyone blocking for one */
+                        if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list &&
+                            waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq))
+                                wake_up (&ktoenal_data.ksnd_idle_ltx_waitq);
+                }
+                ktoenal_packets_transmitted++;
+        }
+        else
+        {
+                /* ktoenal_sendmsg() already advanced the iov past what went */
+                tx->tx_nob -= rc;
+
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue))  /* nothing to write */
+        {
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+}
+
+/* Queue packet 'tx' for transmission on 'conn'.
+ *
+ * The packet's iov is first trimmed so that its fragments add up to exactly
+ * tx->tx_nob bytes (tx_niov is reduced to match), because the send path
+ * transmits whole iovs regardless of the requested length.  The packet is
+ * then appended to the conn's tx queue and, if the conn is writable and not
+ * already scheduled, the conn is put on ksnd_tx_conns with an extra ref for
+ * the scheduler, which is woken if it is waiting.
+ */
+void
+ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long flags;            /* spin_lock_irqsave() wants unsigned long */
+        int           nob = tx->tx_nob;
+        struct iovec *iov = tx->tx_iov;
+        int           niov = 1;
+        
+        LASSERT (nob >= sizeof (ptl_hdr_t));
+
+        /* Truncate iov to exactly match total packet length
+         * since socket sendmsg pays no attention to requested length.
+         */
+        for (;;)
+        {
+                /* NB iov_len is unsigned, so there is no point asserting it
+                 * is non-negative */
+                LASSERT (niov <= tx->tx_niov);
+                
+                if (iov->iov_len >= nob)
+                {
+                        iov->iov_len = nob;
+                        break;
+                }
+                nob -= iov->iov_len;
+                iov++;
+                niov++;
+        }
+        tx->tx_niov = niov;
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled)           /* not scheduled to send */
+        {
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        ktoenal_packets_launched++;
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
+/* Send a message (header 'hdr' plus 'payload_niov' payload fragments) to
+ * 'nid'.
+ *
+ * The connection used is the one to 'nid' itself if it is a peer, otherwise
+ * the one to the gateway the router nominates.  A local transmit descriptor
+ * is filled in with a private copy of the header, the finalize arguments
+ * ('private', 'cookie') and the payload fragments, then queued with
+ * ktoenal_launch_packet().
+ *
+ * Returns 0 once the packet is queued, or -1 if there is no route/peer or no
+ * transmit descriptor could be had.
+ */
+int
+ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie,
+              ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+              unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        int           rc;
+        int           i;
+
+        /* By this point, as it happens, we have absolutely no idea what
+         * 'private' is.  It might be ksock_nal_data or it might be ksock_conn.
+         * Ha ha, isn't that a funny joke?
+         *
+         * FIXME: this is not the right way to fix this; the right way is to
+         * always pass in the same kind of structure.  This is hard right now.
+         * To revisit this issue, set a breakpoint in here and watch for when
+         * it's called from lib_finalize.  I think this occurs when we send a
+         * packet as a side-effect of another packet, such as when an ACK has
+         * been requested. -phil */
+
+        /* NB iov_len is a size_t; cast it so the argument really is the int
+         * that its %d conversion expects */
+        CDEBUG(D_NET, "sending "LPSZ" bytes from [%d](%p,%d)... to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov,
+               payload_niov > 0 ? payload_iov[0].iov_base : NULL,
+               payload_niov > 0 ? (int)payload_iov[0].iov_len : 0,
+               nid, pid);
+
+        if ((conn = ktoenal_get_conn (nid)) == NULL)
+        {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0)
+                {
+                        CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
+                        return (-1);
+                }
+
+                if ((conn = ktoenal_get_conn (gatewaynid)) == NULL)
+                {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", 
+                                nid, gatewaynid);
+                        return (-1);
+                }
+        }
+
+        /* This transmit has now got a ref on conn */
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK ||
+                                 type == PTL_MSG_REPLY ||
+                                 in_interrupt ()));
+        if (ltx == NULL)
+        {
+                CERROR ("Can't allocate tx desc\n");
+                ktoenal_put_conn (conn);
+                return (-1);
+        }
+        
+        /* Init common (to sends and forwards) packet part */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_iov = ltx->ltx_iov;
+
+        /* Init local send packet (storage for hdr, finalize() args, iov) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        /* frag 0 is always my private copy of the header */
+        ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        for (i = 0; i < payload_niov; i++)
+        {
+                ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base;
+                ltx->ltx_iov[1 + i].iov_len  = payload_iov[i].iov_len;
+        }
+
+        ktoenal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+/* Router callback: send forwarding descriptor 'fwd' on to its next hop.
+ *
+ * The next hop is fwd's gateway nid, unless that is this node, in which case
+ * it is the target nid.  If there is no connection to the next hop the
+ * forward is completed immediately with -EHOSTUNREACH; otherwise the tx
+ * descriptor living in fwd's scratch area is filled in and queued.
+ * 'arg' is not used.
+ */
+void
+ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ptl_nid_t     peer_nid;
+        ksock_conn_t *conn;
+        ksock_tx_t   *tx;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, 
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        if (fwd->kprfd_gateway_nid == ktoenal_lib.ni.nid)
+                peer_nid = fwd->kprfd_target_nid;       /* I'm the gateway; must be the last hop */
+        else
+                peer_nid = fwd->kprfd_gateway_nid;
+
+        conn = ktoenal_get_conn (peer_nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, peer_nid);
+                kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx = (ksock_tx_t *)&fwd->kprfd_scratch;
+        tx->tx_isfwd = 1;                       /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+
+        ktoenal_launch_packet (conn, tx);
+}
+
+/* Start a kernel thread running fn(arg).  On success the live-thread count
+ * is incremented and 0 is returned; otherwise the (negative) value returned
+ * by kernel_thread() is passed back. */
+int
+ktoenal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long rc = kernel_thread (fn, arg, 0);
+
+        if (rc >= 0) {
+                atomic_inc (&ktoenal_data.ksnd_nthreads);
+                return (0);
+        }
+
+        return ((int)rc);
+}
+
+/* Counterpart of ktoenal_thread_start(): take one off the live-thread
+ * count. */
+void ktoenal_thread_fini (void)
+{
+        atomic_dec (&ktoenal_data.ksnd_nthreads);
+        return;
+}
+
+/* Completion callback for a forwarding message buffer; 'arg' is the
+ * ksock_fmb_t and 'error' the forwarding status (non-zero is logged as an
+ * error).
+ *
+ * The buffer goes back on its pool's idle list.  If a connection is blocked
+ * waiting for a buffer from that pool, the first one is moved from
+ * SOCKNAL_RX_FMB_SLEEP back to SOCKNAL_RX_GET_FMB, queued on ksnd_rx_conns
+ * and the scheduler is woken if it is waiting.
+ */
+void
+ktoenal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn;
+        unsigned long      flags;       /* spin_lock_irqsave() wants unsigned long */
+
+        CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", 
+                hdr->src_nid, hdr->dest_nid, error);
+
+        if (error != 0)
+                CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", 
+                        hdr->src_nid, hdr->dest_nid, error);
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+        list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+
+        if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns))
+        {
+                /* hand the scheduler the first conn waiting on this pool */
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+
+                CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+                LASSERT (conn->ksnc_rx_scheduled);
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+                conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
+/* Get a forwarding buffer for the packet 'conn' is receiving.
+ *
+ * The small pool is used if header + remaining payload fits in
+ * SOCKNAL_SMALL_FWD_PAGES pages, the large pool otherwise.  If the chosen
+ * pool has no idle buffer, the conn is put in SOCKNAL_RX_FMB_SLEEP, queued
+ * on the pool's blocked list and NULL is returned.
+ */
+ksock_fmb_t *
+ktoenal_get_idle_fmb (ksock_conn_t *conn)
+{
+        /* NB called with sched lock held */
+        int               packet_nob = sizeof (ptl_hdr_t) + conn->ksnc_rx_nob_left;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+
+        pool = (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) ?
+               &ktoenal_data.ksnd_small_fmp : &ktoenal_data.ksnd_large_fmp;
+
+        if (list_empty (&pool->fmp_idle_fmbs)) {
+                /* deschedule until fmb free */
+                conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+                list_add_tail (&conn->ksnc_rx_list, &pool->fmp_blocked_conns);
+                return (NULL);
+        }
+
+        fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list);
+        list_del (&fmb->fmb_list);
+        return (fmb);
+}
+
+
+/*
+ * Prepare forwarding buffer 'fmb' for the packet whose header has just been
+ * read into conn->ksnc_hdr.
+ *
+ * The header is copied into the first page of the fmb.  If the packet has
+ * no payload it is passed to the router straight away and the conn is reset
+ * to read the next header.  Otherwise the fmb's iov is built to span the
+ * whole packet, the conn's rx iov is pointed at the payload part of it and
+ * the conn moves to SOCKNAL_RX_BODY_FWD.
+ *
+ * Returns 1 if the packet was forwarded immediately, 0 if the payload still
+ * has to be read.
+ */
+int
+ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        int niov;                               /* at least the header */
+        int nob;
+        
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+        
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+                
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */
+
+        if (payload_nob == 0)                   /* got complete packet already */
+        {
+                atomic_inc (&ktoenal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob);
+
+                /* single iov entry holding just the header */
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                              packet_nob, 1, fmb->fmb_iov, 
+                              ktoenal_fmb_callback, fmb);
+
+                kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        /* one iov entry per fmb page touched by the packet */
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE)            /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        else
+        {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+                
+                do
+                {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                      packet_nob, niov, fmb->fmb_iov, 
+                      ktoenal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */        
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0]));
+
+        /* first rx entry starts just past the header within page 0 */
+        conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        /* remaining entries are identical to the fmb's */
+        if (niov > 1)
+                memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob);
+        return (0);
+}
+
+/*
+ * Inspect the header of a packet that is not addressed to this node and
+ * decide what to do with it.
+ *
+ * On return the conn is in one of three states:
+ *   SOCKNAL_RX_HEADER   packet dropped, nothing to skip;
+ *   SOCKNAL_RX_SLOP     packet dropped, its body is being skipped;
+ *   SOCKNAL_RX_GET_FMB  packet accepted, rx_nob_left/rx_nob_wanted hold the
+ *                       payload size and a forwarding buffer is needed.
+ */
+void
+ktoenal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *peer;
+        int           nob;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        /* how much payload follows this header? */
+        if (conn->ksnc_hdr.type == PTL_MSG_GET ||
+            conn->ksnc_hdr.type == PTL_MSG_ACK)
+        {
+                nob = 0;
+        }
+        else if (conn->ksnc_hdr.type == PTL_MSG_PUT)
+        {
+                nob = conn->ksnc_hdr.msg.put.length;
+        }
+        else if (conn->ksnc_hdr.type == PTL_MSG_REPLY)
+        {
+                nob = conn->ksnc_hdr.msg.reply.length;
+        }
+        else
+        {
+                /* unknown type: drop the header, start over on a new packet */
+                CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n",
+                        conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                ktoenal_new_packet (conn, 0);
+                return;
+        }
+
+        if (nob < 0)
+        {
+                /* corrupt length: nothing sensible to skip */
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, nob);
+                ktoenal_new_packet (conn, 0);
+                return;
+        }
+
+        if (nob > SOCKNAL_MAX_FWD_PAYLOAD)
+        {
+                /* too big to forward: skip over its body */
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, nob);
+                ktoenal_new_packet (conn, nob);
+                return;
+        }
+
+        /* a destination we hold a conn to should have been sent to directly */
+        peer = ktoenal_get_conn (conn->ksnc_hdr.dest_nid);
+        if (peer != NULL)
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                ktoenal_put_conn (peer);                /* drop the ref just taken */
+
+                ktoenal_new_packet (conn, nob);         /* skip over its body */
+                return;
+        }
+
+        /* accepted: remember the payload size (no slop) and ask for an fmb */
+        conn->ksnc_rx_nob_wanted = nob;
+        conn->ksnc_rx_nob_left = nob;
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+}
+
+/*
+ * Point the conn's receive state at whatever comes next on the stream.
+ *
+ * nob_to_skip == 0: set up to read a fresh header into conn->ksnc_hdr and
+ * return 1.
+ *
+ * nob_to_skip != 0: set up to discard that many bytes (or as many as the
+ * conn's rx iov can describe) into a scratch buffer and return 0.
+ */
+int
+ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ktoenal_slop_buffer[4096];
+
+        int   chunk;
+        int   total = 0;
+        int   n = 0;
+
+        if (nob_to_skip != 0)
+        {
+                /* Skip as much as possible now; every iov entry aims at the
+                 * same scratch buffer.  If we run out of iov entries first,
+                 * there's more left and we'll get called again. */
+                conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+                conn->ksnc_rx_nob_left = nob_to_skip;
+
+                do
+                {
+                        chunk = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer));
+
+                        conn->ksnc_rx_iov[n].iov_base = ktoenal_slop_buffer;
+                        conn->ksnc_rx_iov[n].iov_len  = chunk;
+                        n++;
+                        total += chunk;
+                        nob_to_skip -= chunk;
+
+                } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                         n < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0]));
+
+                conn->ksnc_rx_niov = n;
+                conn->ksnc_rx_nob_wanted = total;
+                return (0);
+        }
+
+        /* right at the next packet boundary: read a header */
+        conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+        conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+        conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+        conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+        conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+        conn->ksnc_rx_niov = 1;
+        return (1);
+}
+
+void
+ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_fmb_t *fmb;
+        int          len;
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* NB: sched lock held */
+        CDEBUG(D_NET, "conn %p\n", conn);
+
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)     /* doesn't need a forwarding buffer */
+        {
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                goto try_read;
+        }
+
+ get_fmb:
+        /* NB: sched lock held */
+        fmb = ktoenal_get_idle_fmb (conn);
+        if (fmb == NULL)                        /* conn descheduled waiting for idle fmb */
+                return;
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+        
+        if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;                /* data ready may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to read */
+
+        /* NB ktoenal_recvmsg "consumes" the iov passed to it */
+        len = ktoenal_recvmsg(conn->ksnc_file,
+                               conn->ksnc_rx_iov, conn->ksnc_rx_niov,
+                               conn->ksnc_rx_nob_wanted);
+        CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len);
+
+        if (len <= 0)                           /* nothing ready (EAGAIN) or EOF or error */
+        {
+                if (len != -EAGAIN &&           /* ! nothing to read now */
+                    len != 0)                   /* ! nothing to read ever */
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal read(%d) %p: %d\n",
+                                conn->ksnc_rx_nob_wanted, conn, len);
+                }
+                goto out;                       /* come back when there's data ready */
+        }
+
+        LASSERT (len <= conn->ksnc_rx_nob_wanted);
+        conn->ksnc_rx_nob_wanted -= len;
+        conn->ksnc_rx_nob_left -= len;
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        conn->ksnc_rx_ready = 1;                /* assume there's more to be had */
+
+        switch (conn->ksnc_rx_state)
+        {
+        case SOCKNAL_RX_HEADER:
+                if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */
+                {
+                        ktoenal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state)
+                        {
+                        case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping this packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                        }
+                        /* Not Reached */
+                        LBUG ();
+                }
+
+                PROF_START(lib_parse);
+                lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */
+                {
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ktoenal_packets_received);
+                lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */
+                        goto out;               /* come back later */
+                goto try_read;                  /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ktoenal_packets_received);
+
+                /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */
+                kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        if (!conn->ksnc_rx_ready)               /* no data there to read? */
+        {
+                conn->ksnc_rx_scheduled = 0;    /* let socket callback schedule again */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+}
+
+/*
+ * cb_recv handler: record where the payload of the message being parsed
+ * should land.
+ *
+ * 'private' is the conn the header arrived on.  The caller's iov is copied
+ * into the conn's rx iov, 'mlen' becomes the number of bytes wanted and
+ * 'rlen' the number of bytes left in the packet.  Nothing is read here.
+ *
+ * Returns rlen.
+ */
+int
+ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg,
+             unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+        unsigned int  n;
+
+        /* remember the message so it can be finalized once the body is in */
+        conn->ksnc_cookie = msg;
+
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+
+        for (n = 0; n < niov; n++)
+        {
+                conn->ksnc_rx_iov[n].iov_len = iov[n].iov_len;
+                conn->ksnc_rx_iov[n].iov_base = iov[n].iov_base;
+        }
+        conn->ksnc_rx_niov = niov;
+
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        return (rlen);
+}
+
+/*
+ * Scheduler thread body.
+ *
+ * Until ksnd_shuttingdown is set, each pass takes at most one conn off
+ * ksnd_rx_conns and one off ksnd_tx_conns and progresses them.  With nothing
+ * to do it sleeps on ksnd_sched_waitq; after SOCKNAL_RESCHED consecutive
+ * busy passes it offers to yield the CPU.
+ *
+ * 'arg' is not used.  Always returns 0.
+ */
+int
+ktoenal_scheduler (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        int                nloops = 0;
+
+        kportal_daemonize ("ktoenal_sched");
+        kportal_blockallsigs ();
+        
+        /* ksnd_sched_lock is held across the loop test and the list checks */
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&ktoenal_data.ksnd_rx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_rx_conns.next,
+                                           ksock_conn_t, ksnc_rx_list);
+                        list_del (&conn->ksnc_rx_list);
+
+                        ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */
+                }
+
+                if (!list_empty (&ktoenal_data.ksnd_tx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_tx_conns.next,
+                                           ksock_conn_t, ksnc_tx_list);
+
+                        list_del (&conn->ksnc_tx_list);
+                        ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */
+                }
+
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) /* hogging CPU? */
+                {
+                        /* never sleep or yield with the lock held */
+                        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+                                rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq,
+                                                               ktoenal_data.ksnd_shuttingdown ||
+                                                               !list_empty (&ktoenal_data.ksnd_rx_conns) ||
+                                                               !list_empty (&ktoenal_data.ksnd_tx_conns));
+                                /* an interrupted wait is treated as a bug */
+                                LASSERT (rc == 0);
+                        } else 
+                                our_cond_resched();
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+
+/*
+ * Reaper thread body.
+ *
+ * Until ksnd_shuttingdown is set, conns are taken one at a time off
+ * ksnd_reaper_list (under ksnd_reaper_lock) and passed to
+ * ktoenal_close_conn() with the lock released.  With the list empty the
+ * thread sleeps on ksnd_reaper_waitq.
+ *
+ * 'arg' is not used.  Always returns 0.
+ */
+int
+ktoenal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ktoenal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                /* detach the first conn, if any, while holding the lock */
+                if (list_empty (&ktoenal_data.ksnd_reaper_list))
+                        conn = NULL;
+                else
+                {
+                        conn = list_entry (ktoenal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (conn != NULL)
+                        ktoenal_close_conn (conn);
+                else {
+                        rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq,
+                                                       ktoenal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ktoenal_data.ksnd_reaper_list));
+                        /* an interrupted wait is treated as a bug */
+                        LASSERT (rc == 0);
+                }
+        }
+
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+/* poll() result bits treated as "readable" and "writable" respectively */
+#define POLLREAD        (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)
+#define POLLWRITE       (POLLOUT | POLLWRNORM | POLLWRBAND)
+
+/*
+ * Poll thread body.
+ *
+ * Until ksnd_shuttingdown is set, each pass walks ksnd_socklist, calls the
+ * poll method of every conn's file and turns the returned mask into
+ * ktoenal_data_ready() / ktoenal_write_space() calls, then sleeps.
+ *
+ * The poll table ksnd_pwait is only passed to poll() on passes where
+ * ksnd_slistchange is set; otherwise NULL is passed.  When the flag is seen
+ * set after the sleep, the poll table is torn down and re-initialised.
+ *
+ * 'arg' is not used.  Always returns 0.
+ *
+ * NOTE(review): ksnd_socklist_lock is released inside list_for_each() and
+ * the walk resumes from conn's own list node after re-locking.  That only
+ * works if conn is still linked at that point, and the loop may call
+ * _ktoenal_put_conn() on the very conn whose node is read next -- confirm
+ * against the code that unlinks and frees conns.
+ */
+int
+ktoenal_pollthread(void *arg)
+{
+        unsigned int mask;
+        struct list_head *tmp;
+        ksock_conn_t *conn;
+        
+        /* Save the task struct for waking it up */
+        ktoenal_data.ksnd_pollthread_tsk = current; 
+        
+        kportal_daemonize ("ktoenal_pollthread");
+        kportal_blockallsigs ();
+        
+        poll_initwait(&ktoenal_data.ksnd_pwait);
+        
+        while(!ktoenal_data.ksnd_shuttingdown) {
+                
+                /* NOTE(review): task state is set before the scan, not just
+                 * before schedule_timeout(); presumably so a wakeup during
+                 * the scan is not lost -- confirm nothing in the scan
+                 * sleeps and resets it */
+                set_current_state(TASK_INTERRUPTIBLE);
+                
+                read_lock (&ktoenal_data.ksnd_socklist_lock);
+                list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+                        
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        /* hold a ref on conn while the list lock is dropped */
+                        atomic_inc(&conn->ksnc_refcount);
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                        
+                        mask = conn->ksnc_file->f_op->poll(conn->ksnc_file,
+                                  ktoenal_data.ksnd_slistchange ? 
+                                  &ktoenal_data.ksnd_pwait : NULL);
+                         
+                        if(mask & POLLREAD) {
+                                ktoenal_data_ready(conn);
+                                                        
+                        } 
+                        if (mask & POLLWRITE) {
+                                ktoenal_write_space(conn);  
+                              
+                        }
+                        if (mask & (POLLERR | POLLHUP)) {
+                                         /* Do error processing */          
+                                         /* (nothing is done here yet) */
+                        }      
+                        
+                        read_lock (&ktoenal_data.ksnd_socklist_lock);
+                        /* drop the ref taken above; last ref out cleans up */
+                        if(atomic_dec_and_test(&conn->ksnc_refcount))
+                                _ktoenal_put_conn(conn);
+                }
+                ktoenal_data.ksnd_slistchange = 0;
+                read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                
+                schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+                /* flag set again while asleep: rebuild the poll table */
+                if(ktoenal_data.ksnd_slistchange) {
+                        poll_freewait(&ktoenal_data.ksnd_pwait); 
+                        poll_initwait(&ktoenal_data.ksnd_pwait);
+                }
+         }
+        poll_freewait(&ktoenal_data.ksnd_pwait);
+        ktoenal_thread_fini();
+        return (0);
+}
+
+/*
+ * Note that 'conn' has data to read and, unless it is already being
+ * progressed, queue it for the scheduler.
+ *
+ * Nothing is done if bit 0 of ksnc_rx_ready was already set.  Otherwise,
+ * under ksnd_sched_lock, a conn that is not yet scheduled is appended to
+ * ksnd_rx_conns, marked scheduled, given an extra ref for the scheduler,
+ * and the scheduler wait queue is woken if anyone is waiting on it.
+ */
+void
+ktoenal_data_ready (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+        ENTRY;
+
+        /* only the caller that flips the bit from 0 to 1 goes on */
+        if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { 
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail (&conn->ksnc_rx_list, 
+                                        &ktoenal_data.ksnd_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        /* This is done to avoid the effects of a sequence
+                         * of events in which the rx_ready is lost
+                         */
+                        conn->ksnc_rx_ready=1;
+                          
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+
+        EXIT;
+}
+
+/*
+ * Note that 'conn' can accept more data and, if it has packets queued and
+ * is not already being progressed, queue it for the scheduler.
+ *
+ * A NULL conn is traced and otherwise ignored.  Nothing more is done if
+ * bit 0 of ksnc_tx_ready was already set.  Otherwise, under
+ * ksnd_sched_lock, a conn with a non-empty tx queue that is not yet
+ * scheduled is appended to ksnd_tx_conns, marked scheduled, given an extra
+ * ref for the scheduler, and the scheduler wait queue is woken if anyone is
+ * waiting on it.
+ */
+void
+ktoenal_write_space (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+
+        CDEBUG (D_NET, "conn %p%s%s%s\n",
+                         conn,
+                        (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"),
+                        (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? " scheduled" : " idle"),
+                        (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued"));
+
+        /* the trace above tolerates a NULL conn; the code below must too */
+        if (conn == NULL)
+                return;
+
+        /* only the caller that flips the bit from 0 to 1 goes on */
+        if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */
+                                !conn->ksnc_tx_scheduled) { /* not being progressed */
+
+                        list_add_tail (&conn->ksnc_tx_list, 
+                                        &ktoenal_data.ksnd_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+}
+
+/* Callback table through which the portals library drives this NAL */
+nal_cb_t ktoenal_lib = {
+        .nal_data    = &ktoenal_data,           /* NAL private data */
+        .cb_send     = ktoenal_send,
+        .cb_recv     = ktoenal_recv,
+        .cb_read     = ktoenal_read,
+        .cb_write    = ktoenal_write,
+        .cb_callback = ktoenal_callback,
+        .cb_malloc   = ktoenal_malloc,
+        .cb_free     = ktoenal_free,
+        .cb_printf   = ktoenal_printf,
+        .cb_cli      = ktoenal_cli,
+        .cb_sti      = ktoenal_sti,
+        .cb_dist     = ktoenal_dist
+};
diff --git a/lustre/portals/libcfs/Makefile.am b/lustre/portals/libcfs/Makefile.am
new file mode 100644 (file)
index 0000000..e2e11af
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+MODULE = portals
+modulenet_DATA = portals.o
+EXTRA_PROGRAMS = portals
+
+# Sources that are not kept in this directory: they are symlinked in from
+# $(srcdir)/../portals by the link-stamp rule below.
+LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-not-impl.c lib-pid.c
+APILINKS := api-eq.c api-errno.c api-init.c api-md.c api-me.c api-ni.c api-wrap.c
+LINKS = $(APILINKS) $(LIBLINKS) 
+DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej
+
+# Every linked source depends on link-stamp, so the symlinks are made once.
+# The leading '-' on the ln loops lets make carry on if a link fails.
+$(LINKS): link-stamp
+link-stamp:
+       -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       echo timestamp > link-stamp
+
+DEFS =
+portals_SOURCES = $(LINKS) module.c proc.c debug.c
+
+# Don't distribute any patched files.
+# NOTE(review): EXT2C is not assigned anywhere in this file; unless
+# ../Rules.linux sets it, the loop below removes nothing -- confirm.
+dist-hook:
+       list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done
+
+include ../Rules.linux
diff --git a/lustre/portals/libcfs/Makefile.mk b/lustre/portals/libcfs/Makefile.mk
new file mode 100644 (file)
index 0000000..3196ea2
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+# libcfs.o is a composite object: its members must be listed in a variable
+# named after it (libcfs-objs).
+obj-y += libcfs.o
+libcfs-objs   := module.o proc.o debug.o
\ No newline at end of file
diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c
new file mode 100644 (file)
index 0000000..6233b8d
--- /dev/null
@@ -0,0 +1,821 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+
+#define DEBUG_OVERFLOW 1024
+static char *debug_buf = NULL;
+static unsigned long debug_size = 0;
+static atomic_t debug_off_a = ATOMIC_INIT(0);
+static int debug_wrapped;
+wait_queue_head_t debug_ctlwq;
+#define DAEMON_SND_SIZE      (64 << 10)
+
+/*
+ * Used by the daemon to keep track of the offset into the debug buffer for
+ * the next write to the file.  Usually, the daemon writes out the buffer
+ * from debug_daemon_next_write up to debug_off.
+ *  Variable usage
+ *      Reader - portals_debug_msg()
+ *      Writer - portals_debug_daemon()
+ *               portals_debug_daemon_start() during daemon init time
+ *               portals_debug_daemon_continue() to reset to debug_off
+ *               portals_debug_clear_buffer() reset to debug_off for clear
+ *      Note that *_start(), *_continue() & *clear_buffer() should be
+ *      serialized.
+ */
+static atomic_t   debug_daemon_next_write;
+
+/*
+ * A debug_daemon can be in following states
+ *      stopped - stopped state means there is no debug_daemon running.
+ *                accordingly, it must be in paused state
+ *                a daemon is in !stopped && !paused state after
+ *                "lctl debug_daemon start" creates debug_daemon successfully
+ *                Variable Usage
+ *                      Reader - portals_debug_daemon()
+ *                               portals_debug_set_daemon() routines
+ *                      Writer - portals_debug_set_daemon() routines
+ *                              portals_debug_daemon() on IO error
+ *      paused -  a debug_daemon state is changed from !paused into paused
+ *                when "lctl debug_daemon paused" is issued
+ *                "lctl debug_daemon continue" gets a daemon into !paused mode
+ *                      Reader - portals_debug_set_daemon() routines
+ *                               portals_debug_msg()
+ *                      Writer - portals_debug_set_daemon() on init
+ *                               portals_debug_daemon()
+ *
+ *        Daemon  state diagram.
+ *                      (stopped, paused)
+ *                              |  <-- debug_daemon start
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon pause
+ *                              V
+ *                      (!stopped, paused)
+ *                              |  <-- debug_daemon continue
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon stop
+ *                              V
+ *                      (stopped, paused)
+ *      Overlapped - this is the state in which CDEBUG is too fast for the
+ *                   daemon to write out the debug buffer.  That is, debug_off
+ *                   is about to overlap debug_daemon_next_write;
+ *                     Reader - portals_debug_msg()
+ *                     Writer - portals_debug_msg()
+ */
+
+/*
+ * Description on Trace Daemon Synchronization
+ *
+ * Three categories of code are synchronizing between each other
+ * 1.   lctl, portals_debug_set_daemon(), the user debug control code, 
+ *      as well as portals_debug_clear_buffer()
+ * 2.   CDEBUG, portals_debug_msg(), the debug put messages routine
+ * 3.   Daemon, portals_debug_daemon(), to write out debug log file
+ *
+ *
+ * Three different controls for synchronizations
+ *
+ * 1.   debug_daemon_semaphore
+ *      This semaphore serializes multiple lctl controls that manipulate the
+ *      debug daemon state.  The semaphore serves as the gatekeeper that
+ *      allows only one user control thread, at any given time, to access the
+ *      debug daemon state and keeps the other user control requests in a
+ *      wait state until the current control request is serviced.
+ *
+ * 2.   wait_queue_head_t lctl (paired with lctl_event flag)
+ *      Lctl event is the event between portals_debug_set_daemon() and 
+ *      portals_debug_daemon().  Lctl is an indicator for portals_debug_daemon()
+ *      to flush data out to file.  portals_debug_daemon() is to use lctl event
+ *      as signal channel to wakeup portals_debug_set_daemon() upon flush 
+ *      operation is done.
+ *
+ *      Producer :
+ *              portals_debug_daemon() uses to wake up 
+ *              portals_debug_set_daemon(), pause and stop, routines
+ *      Consumer :
+ *              portals_debug_set_daemon(), stop and pause operations, 
+ *              wait and sleep on the event
+ *
+ * 3.   wait_queue_head_t daemon (paired with daemon_event flag)
+ *      This is an event channel to wakeup portals_debug_daemon.  Daemon 
+ *      wakes up to run whenever there is an event posted.   Daemon handles 
+ *      2 types of operations . 1. Writes data out to debug file, 2. Flushes 
+ *      file and terminates base on lctl event. 
+ *      File operation -
+ *              Daemon is normally in a sleep state.  
+ *              Daemon is woken up through daemon event whenever CDEBUG is 
+ *              putting data over any 64K boundary. 
+ *      File flush and termination -
+ *              On portals_debug_daemon_stop/pause() operations, lctl control 
+ *              is to wake up daemon through daemon event.
+ *
+ *      We can't use sleep_on() and wake_up() to replace the daemon event
+ *      because portals_debug_daemon() must catch the wakeup operation posted
+ *      by portals_debug_daemon_stop/pause().  Otherwise, stop and pause may
+ *      get stuck in the lctl wait event.
+ *
+ *      Producer :
+ *           a. portals_debug_daemon_pause() and portals_debug_daemon_stop() 
+ *              uses the event to wake up portals_debug_daemon()
+ *           b. portals_debug_msg() uses the event to wake up
+ *              portals_debug_daemon() whenever the data output crosses
+ *              a 64K-byte boundary.
+ *      Consumer :
+ *              portals_debug_daemon() wakes up upon daemon event.
+ *
+ * Sequence for portals_debug_daemon_stop() operation
+ *
+ * _Portals_debug_daemon_stop()_          _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      Paused = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Set force_flush flag if lctlevnt
+ *                                      Flush data
+ *                                      Wakeup_event (lctl)
+ *                                      Wait_event(daemon)
+ *      Stopped = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Exit daemon loop if (Stopped)
+ *                                      Wakeup_event (lctl)
+ *                                      Exit
+ *      Return to user application
+ *
+ *
+ * _Portals_debug_msg()_                  _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      If (WriteStart<64K<WriteEnd)
+ *         Wakeup_event(daemon)
+ *                                      Do file IO
+ *                                      Wait_event(daemon)
+ */
+/*
+ * State shared by the lctl control routines, portals_debug_msg() and the
+ * portals_debug_daemon() thread.  Each wait queue is paired with the
+ * *_event flag declared just before it; that flag is the condition tested
+ * by wait_event() on the queue.
+ */
+struct debug_daemon_state {
+        /* set by portals_debug_msg() while it is dropping messages because
+         * less than DAEMON_SND_SIZE of buffer is left ahead of the daemon */
+        unsigned long overlapped;
+        /* non-zero when no daemon thread is running */
+        unsigned long stopped;
+        /* non-zero when the daemon is paused (always the case when stopped) */
+        atomic_t paused;
+        unsigned long   lctl_event;     /* event for lctl */
+        wait_queue_head_t lctl;
+        unsigned long   daemon_event;   /* event for daemon */
+        wait_queue_head_t daemon;
+};
+static struct debug_daemon_state debug_daemon_state;
+static DECLARE_MUTEX(debug_daemon_semaphore);
+
+static loff_t daemon_file_size_limit;
+char debug_daemon_file_path[1024] = "";
+
+spinlock_t portals_debug_lock = SPIN_LOCK_UNLOCKED;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+int handled_panic; /* to avoid recursive calls to notifiers */
+char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall";
+
+
+/*
+ * Kernel-thread body that writes the in-memory debug buffer to a new file
+ * named "<debug_file_path>.<current time>".
+ *
+ * Started by portals_debug_dumplog(), which sleeps on debug_ctlwq until this
+ * thread wakes it up.  'arg' is unused.  Always returns 0; failures are only
+ * reported through CERROR()/printk().
+ */
+int portals_do_debug_dumplog(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        int rc;
+        mm_segment_t oldfs;
+        unsigned long debug_off;
+
+        kportal_daemonize("");
+
+        reparent_to_init();
+        /* Hide any journal handle of the originating context while we do
+         * file I/O; it is put back before returning. */
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+
+        /* Bounded: debug_file_path alone may be as long as debug_file_name,
+         * so an unbounded sprintf() could overrun the destination. */
+        snprintf(debug_file_name, sizeof(debug_file_name), "%s.%ld",
+                 debug_file_path, CURRENT_TIME);
+        file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for dumping\n", debug_file_name);
+                GOTO(out, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+                       debug_file_name);
+        }
+
+        debug_off = atomic_read(&debug_off_a);
+        /* The buffer lives in kernel space: widen the address limit so the
+         * file's write method accepts it. */
+        oldfs = get_fs();
+        set_fs(get_ds());
+        if (debug_wrapped) {
+                /* Oldest data first: the region after the write offset,
+                 * then the region from the start up to the offset. */
+                rc = file->f_op->write(file, debug_buf + debug_off + 1,
+                                       debug_size-debug_off-1, &file->f_pos);
+                rc += file->f_op->write(file, debug_buf, debug_off + 1,
+                                        &file->f_pos);
+        } else {
+                rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
+        }
+        printk("wrote %d bytes\n", rc);
+        set_fs(oldfs);
+
+        rc = file->f_op->fsync(file, file->f_dentry, 1);
+        if (rc)
+                CERROR("sync returns %d\n", rc);
+        filp_close(file, 0);
+out:
+        current->journal_info = journal_info;
+        wake_up(&debug_ctlwq);
+        return 0;
+}
+
+/*
+ * Kernel-thread body of the debug daemon: streams the debug buffer to the
+ * file named by debug_daemon_file_path.
+ *
+ * Each pass of the outer loop writes the data between
+ * debug_daemon_next_write and debug_off, then sleeps until daemon_event is
+ * set.  When lctl_event is found clear (start, pause and stop clear it
+ * before waking or creating the daemon) the pass flushes everything, fsyncs
+ * the file and then either leaves the loop (if stopped) or sets lctl_event
+ * and wakes the lctl wait queue.
+ *
+ * 'arg' is unused.  Always returns 0.
+ */
+int portals_debug_daemon(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        mm_segment_t oldfs;
+        unsigned long force_flush = 0;
+        unsigned long size;
+        int rc;
+
+        kportal_daemonize("ldebug_daemon");
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+
+        file = filp_open(debug_daemon_file_path,
+                         O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for logging", debug_daemon_file_path);
+                /* NOTE(review): this path skips the journal_info restore
+                 * done under "out:" -- confirm that is intended. */
+                GOTO(out1, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+                       debug_daemon_file_path);
+        }
+
+        /* The daemon is now running and not paused. */
+        debug_daemon_state.overlapped = 0;
+        debug_daemon_state.stopped = 0;
+        atomic_set(&debug_daemon_state.paused, 0);
+        /* debug_buf is a kernel address; lift the address limit for write */
+        oldfs = get_fs();
+        set_fs(KERNEL_DS);
+        while (1) {
+                unsigned long ending;
+                unsigned long start, tail;
+                long delta;
+
+                /* consume the wakeup that got us here */
+                debug_daemon_state.daemon_event = 0;
+
+                ending = atomic_read(&debug_off_a);
+                start = atomic_read(&debug_daemon_next_write);
+
+                /* check if paused is imposed by lctl ? */
+                force_flush = !debug_daemon_state.lctl_event;
+
+                /* delta < 0 means the producer has wrapped past the end of
+                 * the buffer: write the tail first, then restart at 0 */
+                delta = ending - start;
+                tail = debug_size - start;
+                size = (delta >= 0) ? delta : tail;
+                while (size && (force_flush || (delta < 0) ||
+                                (size >= DAEMON_SND_SIZE))) {
+                        if (daemon_file_size_limit) {
+                               /* never write past the configured file size */
+                               int ssize = daemon_file_size_limit - file->f_pos;
+                               if (size > ssize)
+                                        size = ssize;
+                        }
+
+                        rc = file->f_op->write(file, debug_buf+start,
+                                               size, &file->f_pos);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                           "Debug_daemon write error %d\n", rc);
+                                goto out;
+                        }
+                        /* advance by what was actually written and work out
+                         * the next chunk */
+                        start += rc;
+                        delta = ending - start;
+                        tail = debug_size - start;
+                        if (tail == 0)
+                                start = 0;
+                        if (delta >= 0)
+                                size = delta;
+                        else
+                                size = (tail == 0) ? ending : tail;
+                        if (daemon_file_size_limit == file->f_pos) {
+                                // file wrapped around
+                                file->f_pos = 0;
+                        }
+                }
+                /* publish how far we got, for portals_debug_msg() */
+                atomic_set(&debug_daemon_next_write, start);
+                if (force_flush) {
+                        rc = file->f_op->fsync(file, file->f_dentry, 1);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                       "Debug_daemon sync error %d\n", rc);
+                                goto out;
+                        }
+                        if (debug_daemon_state.stopped)
+                               break;           
+                        /* tell the waiting control routine we are done */
+                        debug_daemon_state.lctl_event = 1;
+                        wake_up(&debug_daemon_state.lctl);
+                }
+                wait_event(debug_daemon_state.daemon, 
+                           debug_daemon_state.daemon_event);
+                }
+out:
+        /* exiting: mark the daemon paused and stopped */
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+        set_fs(oldfs);
+        filp_close(file, 0);
+        current->journal_info = journal_info;
+out1:
+        /* release whichever control routine is waiting on the lctl event */
+        debug_daemon_state.lctl_event = 1;
+        wake_up(&debug_daemon_state.lctl);
+        return 0;
+}
+
+/*
+ * Print the 64K of debug buffer that precedes the current write offset to
+ * the console with printk(), in pieces of at most 1024 bytes.
+ *
+ * If the 64K window starts before the beginning of the buffer it is split
+ * in two: the end of the buffer first, then the start up to the offset.
+ * Called from the panic notifier when in interrupt context.
+ */
+void portals_debug_print(void)
+{
+        unsigned long dumplen = 64 * 1024;
+        char *start1, *start2;
+        char *end1, *end2;
+        unsigned long debug_off = atomic_read(&debug_off_a);
+
+        start1 = debug_buf + debug_off - dumplen;
+        if (start1 < debug_buf) {
+                /* window wraps: [start1, end of buffer) then [0, debug_off) */
+                start1 += debug_size;
+                end1 = debug_buf + debug_size - 1;
+                start2 = debug_buf;
+                end2 = debug_buf + debug_off;
+        } else {
+                /* window is contiguous; the second range is empty */
+                end1 = debug_buf + debug_off;
+                start2 = debug_buf + debug_off;
+                end2 = debug_buf + debug_off;
+        }
+
+        /* "%.*s" limits each printk to 'count' bytes.  A plain "%*s" only
+         * sets a minimum field width, so it would keep printing up to the
+         * next NUL and emit overlapping output on every iteration. */
+        while (start1 < end1) {
+                int count = MIN(1024, end1 - start1);
+                printk("%.*s", count, start1);
+                start1 += 1024;
+        }
+        while (start2 < end2) {
+                int count = MIN(1024, end2 - start2);
+                printk("%.*s", count, start2);
+                start2 += 1024;
+        }
+}
+
+/*
+ * Dump the debug buffer to a file: start a portals_do_debug_dumplog()
+ * thread and sleep on debug_ctlwq until that thread wakes us.
+ *
+ * NOTE(review): the wait uses sleep_on(), so a wake_up() issued by the new
+ * thread before we get to sleep would be missed -- confirm whether that can
+ * happen in practice.
+ */
+void portals_debug_dumplog(void)
+{
+        int pid;
+        ENTRY;
+
+        init_waitqueue_head(&debug_ctlwq);
+
+        pid = kernel_thread(portals_do_debug_dumplog,
+                            NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (pid >= 0) {
+                /* thread is running; wait for it to finish the dump */
+                sleep_on(&debug_ctlwq);
+                return;
+        }
+
+        printk(KERN_ERR "cannot start dump thread\n");
+}
+
+/*
+ * Start the debug daemon thread.
+ *
+ * file: path of the log file, or NULL to keep the current
+ *       debug_daemon_file_path.  Paths longer than the destination are
+ *       truncated.
+ * size: log file size limit in megabytes; 0 means no limit.
+ *
+ * Returns -EALREADY if a daemon is already running, the negative
+ * kernel_thread() result if the thread cannot be created, otherwise 0 once
+ * the daemon has signalled the lctl event.
+ */
+int portals_debug_daemon_start(char *file, unsigned int size)
+{
+        int rc;
+
+        if (!debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (file != NULL) {
+                /* strncpy() does not terminate on truncation: copy one byte
+                 * less than the buffer and terminate explicitly. */
+                strncpy(debug_daemon_file_path, file,
+                        sizeof(debug_daemon_file_path) - 1);
+                debug_daemon_file_path[sizeof(debug_daemon_file_path) - 1] =
+                        '\0';
+        }
+
+        init_waitqueue_head(&debug_daemon_state.lctl);
+        init_waitqueue_head(&debug_daemon_state.daemon);
+
+        /* the daemon starts writing from the current end of the log */
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+
+        /* Widen before shifting: 'size << 20' on an unsigned int would wrap
+         * for limits of 4096MB and above. */
+        daemon_file_size_limit = (loff_t)size << 20;
+
+        debug_daemon_state.lctl_event = 0;
+        rc = kernel_thread(portals_debug_daemon, NULL, 0);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start debug daemon thread\n");
+                debug_daemon_file_path[0] = '\0';
+                return rc;
+        }
+        /* wait until the daemon has signalled the lctl event */
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+/*
+ * Pause the debug daemon: mark it paused, wake it so that it flushes the
+ * pending data, and wait until it signals the lctl event.
+ *
+ * Returns -EALREADY if the daemon is already paused, otherwise 0.
+ */
+int portals_debug_daemon_pause(void)
+{
+        if (atomic_read(&debug_daemon_state.paused))
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        /* lctl_event must be cleared before the daemon is woken: the daemon
+         * treats a clear lctl_event as a request to flush, and sets it again
+         * when the flush is done */
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+/*
+ * Resume a paused debug daemon.  The daemon's next-write offset is moved to
+ * the current end of the log, so anything logged while paused is not
+ * written to the file.
+ *
+ * Returns -EINVAL unless a daemon exists and is currently paused,
+ * otherwise 0.
+ */
+int portals_debug_daemon_continue(void)
+{
+        /* only a running-but-paused daemon can be resumed */
+        if (debug_daemon_state.stopped ||
+            !atomic_read(&debug_daemon_state.paused))
+                return -EINVAL;
+
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+        debug_daemon_state.overlapped = 0;
+        atomic_set(&debug_daemon_state.paused, 0);
+        return 0;
+}
+
+/*
+ * Stop the debug daemon.  The daemon is paused first if necessary (which
+ * flushes pending data), then told to exit; the call returns once the
+ * daemon has signalled the lctl event on its way out.
+ *
+ * Returns -EALREADY if no daemon is running, otherwise 0.
+ */
+int portals_debug_daemon_stop(void)
+{
+        if (debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (!atomic_read(&debug_daemon_state.paused))
+                portals_debug_daemon_pause();
+
+        /* clear lctl_event before waking the daemon so that the wait below
+         * only completes on the daemon's exit notification */
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.stopped = 1;
+
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+
+        /* forget the log file name */
+        debug_daemon_file_path[0] = '\0';
+        return 0;
+}
+
+/*
+ * Entry point for debug daemon control requests.  All requests are
+ * serialized by debug_daemon_semaphore.
+ *
+ * cmd:      one of DEBUG_DAEMON_START/STOP/PAUSE/CONTINUE
+ * length:   length of 'filename' including its terminating NUL (START only);
+ *           when non-zero the last byte must be NUL
+ * filename: log file path (START only)
+ * size:     log file size limit passed on to the start routine (START only)
+ *
+ * Returns the result of the selected operation, or -EINVAL for an
+ * unterminated filename or an unknown command.
+ */
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                             char *filename, unsigned int size)
+{
+        int result;
+
+        down(&debug_daemon_semaphore);
+        if (cmd == DEBUG_DAEMON_START) {
+                if (length != 0 && filename[length - 1] != '\0') {
+                        CERROR("Invalid filename for debug_daemon\n");
+                        result = -EINVAL;
+                } else {
+                        result = portals_debug_daemon_start(filename, size);
+                }
+        } else if (cmd == DEBUG_DAEMON_STOP) {
+                result = portals_debug_daemon_stop();
+        } else if (cmd == DEBUG_DAEMON_PAUSE) {
+                result = portals_debug_daemon_pause();
+        } else if (cmd == DEBUG_DAEMON_CONTINUE) {
+                result = portals_debug_daemon_continue();
+        } else {
+                CERROR("unknown set_daemon cmd\n");
+                result = -EINVAL;
+        }
+        up(&debug_daemon_semaphore);
+
+        return result;
+}
+
+/*
+ * Panic notifier callback: save the debug log once per panic.
+ *
+ * In interrupt context the tail of the log is printed to the console;
+ * otherwise the big kernel lock is dropped completely and the log is dumped
+ * to a file.  handled_panic guards against being re-entered.  All three
+ * arguments are unused.  Always returns 0.
+ */
+static int panic_dumplog(struct notifier_block *self, unsigned long unused1,
+                         void *unused2)
+{
+        if (handled_panic)
+                return 0;
+        handled_panic = 1;
+
+        if (!in_interrupt()) {
+                /* release every nested hold of the big kernel lock before
+                 * starting the dump */
+                while (current->lock_depth >= 0)
+                        unlock_kernel();
+                portals_debug_dumplog();
+        } else {
+                portals_debug_print();
+        }
+
+        return 0;
+}
+
+/*
+ * Notifier block that hooks panic_dumplog() into panic_notifier_list.  It
+ * is registered by portals_debug_init() and unregistered by
+ * portals_debug_cleanup().
+ */
+static struct notifier_block lustre_panic_notifier = {
+        notifier_call :     panic_dumplog,
+        next :              NULL,
+        priority :          10000
+};
+
+/*
+ * Allocate and zero the debug buffer and register the panic notifier.
+ *
+ * bufsize: usable size of the buffer in bytes; DEBUG_OVERFLOW extra bytes
+ *          are allocated behind it so a message may run past the end before
+ *          being wrapped.
+ *
+ * Returns -EALREADY if a buffer already exists, -ENOMEM if the allocation
+ * fails, otherwise 0.
+ */
+int portals_debug_init(unsigned long bufsize)
+{
+        unsigned long debug_off = atomic_read(&debug_off_a);
+        char *buf;
+
+        if (debug_buf != NULL)
+                return -EALREADY;
+
+        /* no daemon exists yet */
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+
+        buf = vmalloc(bufsize + DEBUG_OVERFLOW);
+        if (buf == NULL)
+                return -ENOMEM;
+        /* Clear the whole allocation.  debug_size is still 0 at this point,
+         * so it cannot be used as the length. */
+        memset(buf, 0, bufsize + DEBUG_OVERFLOW);
+        debug_wrapped = 0;
+
+        printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n",
+               bufsize, buf);
+        atomic_set(&debug_off_a, debug_off);
+
+        /* Publish the size before the pointer: portals_debug_msg() starts
+         * logging as soon as debug_buf is non-NULL and needs a valid size. */
+        debug_size = bufsize;
+        debug_buf = buf;
+        notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier);
+
+        return 0;
+}
+
+/*
+ * Unregister the panic notifier, stop the debug daemon and free the debug
+ * buffer.
+ *
+ * Returns -EINVAL if there is no buffer, otherwise 0.
+ */
+int portals_debug_cleanup(void)
+{
+        unsigned long flags;
+        char *buf;
+
+        notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier);
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        portals_debug_daemon_stop();
+
+        /* Detach the buffer under the message lock before freeing it, so a
+         * writer already inside portals_debug_msg() has finished, and later
+         * callers see debug_buf == NULL instead of a dangling pointer.
+         * Clearing the pointer also lets portals_debug_init() run again. */
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        buf = debug_buf;
+        debug_buf = NULL;
+        atomic_set(&debug_off_a, 0);
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        vfree(buf);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/*
+ * Discard the contents of the debug buffer by resetting the write offset,
+ * the wrapped flag and the daemon's next-write offset to their initial
+ * values.  A running daemon is paused around the reset and resumed
+ * afterwards.
+ *
+ * Returns -EINVAL if there is no buffer, otherwise 0.
+ */
+int portals_debug_clear_buffer(void)
+{
+        unsigned long flags;
+        unsigned long state;
+
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        /* serialize against the lctl daemon controls */
+        down(&debug_daemon_semaphore);
+        /* remember whether the daemon was already paused */
+        state = atomic_read(&debug_daemon_state.paused);
+        if (!state)
+                portals_debug_daemon_pause();
+        /* reset the offsets under the lock taken by portals_debug_msg() */
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        atomic_set(&debug_off_a, 0);
+        debug_wrapped = 0;
+        atomic_set(&debug_daemon_next_write, 0);
+        debug_daemon_state.overlapped = 0;
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        /* resume the daemon only if it was us who paused it */
+        if (!state)
+                atomic_set(&debug_daemon_state.paused, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/* Debug markers, although printed by S_PORTALS,
+ * should not be marked as such.
+ */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+/*
+ * Write a visible marker containing 'text' into the debug log, framed by
+ * two lines of asterisks.
+ *
+ * Returns -EINVAL if there is no debug buffer, otherwise 0.
+ */
+int portals_debug_mark_buffer(char *text)
+{
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        CDEBUG(0, "*******************************************************************************\n");
+        CDEBUG(0, "DEBUG MARKER: %s\n", text);
+        CDEBUG(0, "*******************************************************************************\n");
+
+        return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+/*
+ * Copy the debug log, oldest data first, into the user-space buffer 'buf'
+ * of 'len' bytes.
+ *
+ * Returns the number of bytes copied, -ENOSPC if 'len' is smaller than the
+ * debug buffer, or -EFAULT if the user buffer cannot be written.
+ *
+ * The write offset and wrapped flag are sampled under portals_debug_lock,
+ * but the copy itself runs without the lock: copy_to_user() may fault and
+ * sleep, which is not allowed with a spinlock held and interrupts disabled.
+ * Messages logged while the copy is in progress may therefore show up
+ * partially in the output.
+ */
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        unsigned long debug_off;
+        unsigned long flags;
+        unsigned long tail;
+        int wrapped;
+
+        if (len < debug_size)
+                return -ENOSPC;
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        debug_off = atomic_read(&debug_off_a);
+        wrapped = debug_wrapped;
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        if (!wrapped) {
+                if (copy_to_user(buf, debug_buf, debug_off))
+                        return -EFAULT;
+                return debug_off;
+        }
+
+        /* All of this juggling with the 1s is to keep the trailing nul
+         * (which falls at debug_buf + debug_off) at the end of what we
+         * copy into user space.  'tail' is the part after the nul; it is
+         * clamped to 0 so an offset at the very end of the buffer cannot
+         * make the length underflow. */
+        tail = (debug_off + 1 < debug_size) ? debug_size - debug_off - 1 : 0;
+        if (copy_to_user(buf, debug_buf + debug_off + 1, tail))
+                return -EFAULT;
+        if (copy_to_user(buf + tail, debug_buf, debug_size - tail))
+                return -EFAULT;
+
+        return debug_size;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   unsigned long stack, const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        int           max_nob;
+        int           prefix_nob;
+        int           msg_nob;
+        struct timeval tv;
+        unsigned long base_offset;
+        unsigned long debug_off;
+
+        if (debug_buf == NULL) {
+                printk("portals_debug_msg: debug_buf is NULL!\n");
+                return;
+        }
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        debug_off = atomic_read(&debug_off_a);
+        if (!atomic_read(&debug_daemon_state.paused)) {
+                unsigned long available;
+                long delta;
+                long v = atomic_read(&debug_daemon_next_write);
+
+                delta = debug_off - v;
+                available = (delta>=0) ? debug_size-delta : -delta;
+                // Check if we still have enough debug buffer for CDEBUG
+                if (available < DAEMON_SND_SIZE) {
+                        /* Drop CDEBUG packets until enough debug_buffer is
+                         * available */
+                        if (debug_daemon_state.overlapped)
+                                 goto out;
+                        /* If this is the first time, leave a marker in the
+                         * output */
+                        debug_daemon_state.overlapped = 1;
+                        ap = NULL;
+                        format = "DEBUG MARKER: Debug buffer overlapped\n";
+                } else  /* More space just became available */
+                        debug_daemon_state.overlapped = 0;
+        }
+
+        max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
+        if (max_nob <= 0) {
+                spin_unlock_irqrestore(&portals_debug_lock, flags);
+                printk("logic error in portals_debug_msg: <0 bytes to write\n");
+                return;
+        }
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        do_gettimeofday(&tv);
+
+        prefix_nob = snprintf(debug_buf + debug_off, max_nob,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id(),
+                              tv.tv_sec, tv.tv_usec);
+        max_nob -= prefix_nob;
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.extern_pid, stack);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.mode.tt.extern_pid, stack);
+#else
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d+%lu): ",
+                           file, line, fn, current->pid, stack);
+#endif
+        max_nob -= msg_nob;
+
+        va_start(ap, format);
+        msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob,
+                            max_nob, format, ap);
+        max_nob -= msg_nob;
+        va_end(ap);
+
+        /* Print to console, while msg is contiguous in debug_buf */
+        /* NB safely terminated see above */
+        if ((mask & D_EMERG) != 0)
+                printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+        if ((mask & D_ERROR) != 0)
+                printk(KERN_ERR   "%s", debug_buf + debug_off + prefix_nob);
+        else if (portal_printk)
+                printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+        base_offset = debug_off & 0xFFFF;
+
+        debug_off += prefix_nob + msg_nob;
+        if (debug_off > debug_size) {
+                memcpy(debug_buf, debug_buf + debug_size,
+                       debug_off - debug_size + 1);
+                debug_off -= debug_size;
+                debug_wrapped = 1;
+        }
+
+        atomic_set(&debug_off_a, debug_off);
+        if (!atomic_read(&debug_daemon_state.paused) &&
+            ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) {
+                debug_daemon_state.daemon_event = 1;
+                wake_up(&debug_daemon_state.daemon);
+        }
+out:
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+}
+
+/*
+ * Set the portals debug mask (portal_debug) to 'debug_level' and report the
+ * new value on the console.
+ */
+void portals_debug_set_level(unsigned int debug_level)
+{
+        portal_debug = debug_level;
+        printk("Setting portals debug level to %08x\n", debug_level);
+}
+
+/*
+ * Run the user-space upcall program named by portals_upcall to report an
+ * LBUG.  The helper is invoked as
+ *
+ *      <portals_upcall> LBUG <file> <fn> <line>
+ *
+ * with a minimal HOME/PATH environment.  The outcome is logged with
+ * CERROR() in both the success and the failure case.
+ */
+void portals_run_lbug_upcall(char * file, char *fn, int line)
+{
+        char  line_str[32];
+        char *argv[6] = { portals_upcall, "LBUG", file, fn, line_str, NULL };
+        char *envp[3] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                          NULL };
+        int   rc;
+
+        ENTRY;
+        /* the line number is passed to the helper as a decimal string */
+        snprintf (line_str, sizeof line_str, "%d", line);
+
+        rc = call_usermodehelper(argv[0], argv, envp);
+        if (rc >= 0) {
+                CERROR("Invoked upcall %s %s %s %s %s\n",
+                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+        } else {
+                CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
+                       "/proc/sys/portals/upcall\n",                
+                       argv[0], argv[1], argv[2], argv[3], argv[4], rc);
+        }
+}
+
+
+EXPORT_SYMBOL(portals_debug_dumplog);
+EXPORT_SYMBOL(portals_debug_msg);
+EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_lbug_upcall);
diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c
new file mode 100644 (file)
index 0000000..1b9e5bb
--- /dev/null
@@ -0,0 +1,572 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+#include <portals/lib-p30.h>
+#include <portals/p30.h>
+#include <linux/kp30.h>
+
+/* minor number used when registering the "portals" misc device below */
+#define PORTAL_MINOR 240
+
+extern void (kping_client)(struct portal_ioctl_data *);
+
+/* One registered NAL command handler plus the opaque pointer that is
+ * handed back to it on every call (see kportal_nal_cmd()). */
+struct nal_cmd_handler {
+        nal_cmd_handler_t nch_handler;
+        void * nch_private;
+};
+
+/* Handler table indexed by NAL number; accessors only accept indices
+ * 1..NAL_MAX_NR.  All reads and writes are done under nal_cmd_sem. */
+static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1];
+struct semaphore nal_cmd_sem;
+
+#ifdef PORTAL_DEBUG
+/*
+ * Log "ASSERTION(expr) failed" at D_EMERG level, attributed to the given
+ * file/function/line, and then LBUG().
+ */
+void
+kportal_assertion_failed (char *expr, char *file, char *func, int line)
+{
+        unsigned long stack = CDEBUG_STACK(stack);
+        portals_debug_msg(0, D_EMERG, file, func, line, stack,
+                          "ASSERTION(%s) failed\n", expr);
+        LBUG();
+}
+#endif
+
+/*
+ * Daemonize the calling kernel thread and give it the name 'str'.
+ * Kernels from 2.5.63 on take the name directly in daemonize(); on older
+ * kernels the name is written into current->comm after daemonize().
+ */
+void
+kportal_daemonize (char *str)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,63))
+        daemonize();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#else
+        daemonize(str);
+#endif
+}
+
+/*
+ * Block signals for the calling task: rebuild current->blocked with
+ * siginitsetinv(..., 0) and recompute the pending state, all under the
+ * task's sigmask_lock with interrupts saved/restored.
+ */
+void
+kportal_blockallsigs ()
+{
+        unsigned long  flags;
+
+        spin_lock_irqsave (&current->sigmask_lock, flags);
+        siginitsetinv (&current->blocked, 0);
+        recalc_sigpending (current);
+        spin_unlock_irqrestore (&current->sigmask_lock, flags);
+}
+
+/*
+ * open() handler for the portals device: takes a module use count.
+ * Returns 0, or -EINVAL when no inode was supplied.
+ */
+static int kportal_psdev_open(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (inode != NULL) {
+                PORTAL_MODULE_USE;
+                RETURN(0);
+        }
+
+        RETURN(-EINVAL);
+}
+
+/*
+ * release() handler for the portals device: drops the module use count
+ * taken at open time.  Returns 0, or -EINVAL when no inode was supplied.
+ */
+static int kportal_psdev_release(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (inode != NULL) {
+                PORTAL_MODULE_UNUSE;
+                RETURN(0);
+        }
+
+        RETURN(-EINVAL);
+}
+
+/* Thin wrapper: release 'len' bytes at 'data' through PORTAL_FREE. */
+static inline void freedata(void *data, int len)
+{
+        PORTAL_FREE(data, len);
+}
+
+/*
+ * Add a route for target NIDs lo_nid..hi_nid via the given gateway by
+ * calling into the router's control interface.
+ *
+ * Returns -ENODEV when the kpr_control_interface symbol cannot be
+ * obtained, otherwise the result of kprci_add_route().
+ */
+static int
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+                  ptl_nid_t hi_nid)
+{
+        kpr_control_interface_t *router;
+        int                      result;
+
+        router = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface);
+        if (router == NULL)
+                return (-ENODEV);
+
+        result = router->kprci_add_route (gateway_nalid, gateway_nid,
+                                          lo_nid, hi_nid);
+
+        /* drop the symbol reference taken above */
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (result);
+}
+
+/*
+ * Remove the route to 'target' through the router's control interface.
+ *
+ * Returns -ENODEV when the kpr_control_interface symbol cannot be
+ * obtained, otherwise the result of kprci_del_route().
+ */
+static int
+kportal_del_route(ptl_nid_t target)
+{
+        kpr_control_interface_t *router;
+        int                      result;
+
+        router = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (router == NULL)
+                return (-ENODEV);
+
+        result = router->kprci_del_route (target);
+
+        /* drop the symbol reference taken above */
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (result);
+}
+
+/*
+ * Fetch routing table entry 'index' through the router's control
+ * interface and hand back its gateway NAL, gateway NID and target NID
+ * range [lo, hi] through the out-parameters.
+ *
+ * Returns -ENODEV when the kpr_control_interface symbol cannot be
+ * obtained, otherwise the result of kprci_get_route().  The
+ * out-parameters are written only when that result is 0.
+ */
+static int
+kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
+                  ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+{
+        int       gateway_nalid;
+        ptl_nid_t gateway_nid;
+        ptl_nid_t lo_nid;
+        ptl_nid_t hi_nid;
+        int       rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
+                                 &hi_nid);
+
+        if (rc == 0) {
+                CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
+                       index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+                *gateway_nalidp = (__u32)gateway_nalid;
+                /* NIDs are 64-bit quantities (they are printed with LPX64
+                 * just above); store them whole rather than squeezing them
+                 * through a __u32 cast, which dropped the upper 32 bits. */
+                *gateway_nidp   = gateway_nid;
+                *lo_nidp        = lo_nid;
+                *hi_nidp        = hi_nid;
+        }
+
+        PORTAL_SYMBOL_PUT (kpr_control_interface);
+        return (rc);
+}
+
+/*
+ * Dispatch an ioctl payload to the command handler registered for 'nal'.
+ * The lookup and the call both happen with nal_cmd_sem held.
+ *
+ * Returns the handler's result, or -EINVAL when 'nal' is out of range or
+ * has no handler registered.
+ */
+static int
+kportal_nal_cmd(int nal, struct portal_ioctl_data *data)
+{
+        int rc = -EINVAL;
+
+        ENTRY;
+
+        down(&nal_cmd_sem);
+
+        if (nal <= 0 || nal > NAL_MAX_NR || !nal_cmd[nal].nch_handler)
+                goto unlock;
+
+        CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd);
+        rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private);
+
+ unlock:
+        up(&nal_cmd_sem);
+        RETURN(rc);
+}
+
+/*
+ * Map a NAL number to that NAL's network interface handle by taking a
+ * reference on the symbol it exports.  A successful lookup must be paired
+ * with kportal_put_ni().
+ *
+ * Returns NULL for TCPNAL (a userspace NAL), for unknown NAL numbers
+ * (logged with CERROR), or when the symbol lookup itself yields NULL.
+ */
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+        ptl_handle_ni_t *ni = NULL;
+
+        switch (nal)
+        {
+        case QSWNAL:
+                ni = PORTAL_SYMBOL_GET(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                ni = PORTAL_SYMBOL_GET(ksocknal_ni);
+                break;
+        case TOENAL:
+                ni = PORTAL_SYMBOL_GET(ktoenal_ni);
+                break;
+        case GMNAL:
+                ni = PORTAL_SYMBOL_GET(kgmnal_ni);
+                break;
+        case SCIMACNAL:
+                ni = PORTAL_SYMBOL_GET(kscimacnal_ni);
+                break;
+        case TCPNAL:
+                /* userspace NAL */
+                break;
+        default:
+                /* A warning to a naive caller */
+                CERROR ("unknown nal: %d\n", nal);
+                break;
+        }
+
+        return (ni);
+}
+
+/*
+ * Drop the symbol reference that a successful kportal_get_ni(nal) took.
+ *
+ * TCPNAL is a userspace NAL for which kportal_get_ni() never hands out a
+ * reference, so being asked to put one is treated as a bug.  Unknown NAL
+ * numbers are only logged.
+ */
+void
+kportal_put_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                PORTAL_SYMBOL_PUT(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                PORTAL_SYMBOL_PUT(ksocknal_ni);
+                break;
+        case TOENAL:
+                PORTAL_SYMBOL_PUT(ktoenal_ni);
+                break;
+        case GMNAL:
+                PORTAL_SYMBOL_PUT(kgmnal_ni);
+                break;
+        case TCPNAL:
+                /* A lesson to a malicious caller */
+                LBUG ();
+                /* Should LBUG() ever return, do not fall through and drop
+                 * a kscimacnal_ni reference that was never taken. */
+                break;
+        case SCIMACNAL:
+                PORTAL_SYMBOL_PUT(kscimacnal_ni);
+                break;
+        default:
+                CERROR ("unknown nal: %d\n", nal);
+                break;
+        }
+}
+
+/*
+ * Register 'handler' (with its 'private' cookie) as the command handler
+ * for NAL number 'nal'.
+ *
+ * Returns 0 on success, -EBUSY when a handler is already registered for
+ * that NAL, and -EINVAL when 'nal' is outside 1..NAL_MAX_NR.  An
+ * out-of-range NAL used to be reported as success even though nothing
+ * had been registered.
+ */
+int
+kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
+
+        if (nal <= 0 || nal > NAL_MAX_NR)
+                return -EINVAL;
+
+        down(&nal_cmd_sem);
+        if (nal_cmd[nal].nch_handler != NULL)
+                rc = -EBUSY;
+        else {
+                nal_cmd[nal].nch_handler = handler;
+                nal_cmd[nal].nch_private = private;
+        }
+        up(&nal_cmd_sem);
+
+        return rc;
+}
+
+/*
+ * Clear the command handler slot of NAL number 'nal'.  Out-of-range NAL
+ * numbers are ignored.  Always returns 0.
+ */
+int
+kportal_nal_unregister(int nal)
+{
+        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
+
+        if (nal <= 0 || nal > NAL_MAX_NR)
+                return 0;
+
+        down(&nal_cmd_sem);
+        nal_cmd[nal].nch_private = NULL;
+        nal_cmd[nal].nch_handler = NULL;
+        up(&nal_cmd_sem);
+
+        return 0;
+}
+
+
+/*
+ * ioctl entry point of the portals misc device.
+ *
+ * The command is validated against the IOC_PORTAL_* type and number
+ * range, the user's portal_ioctl_data is pulled into an on-stack buffer
+ * with portal_ioctl_getdata(), and the request is dispatched on 'cmd'.
+ * Commands that produce output copy the (updated) portal_ioctl_data back
+ * to 'arg'.
+ *
+ * Returns 0 or a command-specific result on success; -EINVAL for a bad
+ * command, bad data or an unusable NAL; -EPERM for IOC_PORTAL_PANIC
+ * without CAP_SYS_BOOT; -EFAULT when results cannot be copied back.
+ */
+static int kportal_ioctl(struct inode *inode, struct file *file,
+                         unsigned int cmd, unsigned long arg)
+{
+        int err = 0;
+        char buf[1024];
+        struct portal_ioctl_data *data;
+
+        ENTRY;
+
+        if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE ||
+             _IOC_NR(cmd) < IOC_PORTAL_MIN_NR  ||
+             _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) {
+                CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                                _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+                RETURN(-EINVAL);
+        }
+
+        if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+                CERROR("PORTALS ioctl: data error\n");
+                RETURN(-EINVAL);
+        }
+
+        data = (struct portal_ioctl_data *)buf;
+
+        switch (cmd) {
+        case IOC_PORTAL_SET_DAEMON:
+                RETURN (portals_debug_set_daemon (
+                                        (unsigned int) data->ioc_count,
+                                        (unsigned int) data->ioc_inllen1,
+                                        (char *) data->ioc_inlbuf1,
+                                        (unsigned int) data->ioc_misc));
+        case IOC_PORTAL_GET_DEBUG: {
+                __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1,
+                                                        data->ioc_plen1);
+
+                if (size < 0)
+                        RETURN(size);
+
+                data->ioc_size = size;
+                /* copy_to_user() returns the number of bytes it could NOT
+                 * copy; turn any shortfall into -EFAULT instead of handing
+                 * a positive byte count back as the ioctl result. */
+                if (copy_to_user((char *)arg, data, sizeof(*data)))
+                        RETURN(-EFAULT);
+                RETURN(0);
+        }
+        case IOC_PORTAL_CLEAR_DEBUG:
+                portals_debug_clear_buffer();
+                RETURN(0);
+        case IOC_PORTAL_PANIC:
+                if (!capable (CAP_SYS_BOOT))
+                        RETURN (-EPERM);
+                panic("debugctl-invoked panic");
+                RETURN(0);
+        case IOC_PORTAL_MARK_DEBUG:
+                /* the marker text must be present and NUL-terminated */
+                if (data->ioc_inlbuf1 == NULL ||
+                    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                        RETURN(-EINVAL);
+                portals_debug_mark_buffer(data->ioc_inlbuf1);
+                RETURN(0);
+        case IOC_PORTAL_PING: {
+                void (*ping)(struct portal_ioctl_data *);
+
+                CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n",
+                       data->ioc_count, data->ioc_nid);
+                ping = PORTAL_SYMBOL_GET(kping_client);
+                if (!ping)
+                        CERROR("PORTAL_SYMBOL_GET failed\n");
+                else {
+                        ping(data);
+                        PORTAL_SYMBOL_PUT(kping_client);
+                }
+                RETURN(0);
+        }
+
+        case IOC_PORTAL_ADD_ROUTE:
+                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                       data->ioc_nal, data->ioc_nid, data->ioc_nid2,
+                       data->ioc_nid3);
+                /* accept the range bounds in either order */
+                err = kportal_add_route(data->ioc_nal, data->ioc_nid,
+                                        MIN (data->ioc_nid2, data->ioc_nid3),
+                                        MAX (data->ioc_nid2, data->ioc_nid3));
+                break;
+
+        case IOC_PORTAL_DEL_ROUTE:
+                CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
+                err = kportal_del_route (data->ioc_nid);
+                break;
+
+        case IOC_PORTAL_GET_ROUTE:
+                CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
+                err = kportal_get_route(data->ioc_count, &data->ioc_nal,
+                                        &data->ioc_nid, &data->ioc_nid2,
+                                        &data->ioc_nid3);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_GET_NID: {
+                const ptl_handle_ni_t *nip;
+                ptl_process_id_t       pid;
+
+                CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlGetId (*nip, &pid);
+                LASSERT (err == PTL_OK);
+                kportal_put_ni (data->ioc_nal);
+
+                data->ioc_nid = pid.nid;
+                if (copy_to_user ((char *)arg, data, sizeof (*data)))
+                        err = -EFAULT;
+                break;
+        }
+
+        case IOC_PORTAL_NAL_CMD:
+                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal,
+                        data->ioc_nal_cmd);
+                err = kportal_nal_cmd(data->ioc_nal, data);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_FAIL_NID: {
+                const ptl_handle_ni_t *nip;
+
+                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
+                        data->ioc_nal, data->ioc_nid, data->ioc_count);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count);
+                /* balance kportal_get_ni(), as IOC_PORTAL_GET_NID does;
+                 * without this every call leaked a reference on the NAL */
+                kportal_put_ni (data->ioc_nal);
+                break;
+        }
+
+        default:
+                err = -EINVAL;
+                break;
+        }
+
+        RETURN(err);
+}
+
+
+/* File operations of the portals device: ioctl plus the open/release
+ * pair that maintains the module use count. */
+static struct file_operations portalsdev_fops = {
+        ioctl:   kportal_ioctl,
+        open:    kportal_psdev_open,
+        release: kportal_psdev_release
+};
+
+
+/* The "portals" misc device (minor PORTAL_MINOR), served by
+ * portalsdev_fops; registered in init_kportals_module(). */
+static struct miscdevice portal_dev = {
+        PORTAL_MINOR,
+        "portals",
+        &portalsdev_fops
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module initialisation.  Brings up, in order: the debug log (5MB), the
+ * NAL command semaphore, the "portals" misc device, the Portals library
+ * (PtlInit) and the proc/sysctl entries.  If a later step fails, the
+ * earlier ones are undone in reverse order and the failing step's error
+ * code is returned.  Returns 0 on success.
+ */
+static int init_kportals_module(void)
+{
+        int err;
+
+        err = portals_debug_init(5 * 1024 * 1024);
+        if (err < 0) {
+                /* the debug log is not available yet, so use printk */
+                printk(KERN_ERR "portals_debug_init: %d\n", err);
+                return (err);
+        }
+
+        sema_init(&nal_cmd_sem, 1);
+
+        err = misc_register(&portal_dev);
+        if (err != 0) {
+                CERROR("misc_register: error %d\n", err);
+                goto undo_debug;
+        }
+
+        err = PtlInit();
+        if (err != 0) {
+                CERROR("PtlInit: error %d\n", err);
+                goto undo_misc;
+        }
+
+        err = insert_proc();
+        if (err != 0) {
+                CERROR("insert_proc: error %d\n", err);
+                goto undo_ptl;
+        }
+
+        CDEBUG (D_OTHER, "portals setup OK\n");
+        return (0);
+
+ undo_ptl:
+        PtlFini();
+ undo_misc:
+        misc_deregister(&portal_dev);
+ undo_debug:
+        portals_debug_cleanup();
+        return err;
+}
+
+/*
+ * Module teardown: removes the proc/sysctl entries, shuts down the
+ * Portals library, deregisters the misc device, reports any memory still
+ * accounted in portal_kmemory, and finally tears down the debug log.
+ * Failures are logged; nothing is returned.
+ */
+static void exit_kportals_module(void)
+{
+        int rc;
+
+        remove_proc();
+        PtlFini();
+
+        CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+
+        rc = misc_deregister(&portal_dev);
+        if (rc)
+                CERROR("misc_deregister error %d\n", rc);
+
+        /* a non-zero count at this point means something was not freed */
+        if (atomic_read(&portal_kmemory) != 0)
+                CERROR("Portals memory leaked: %d bytes\n",
+                       atomic_read(&portal_kmemory));
+
+        /* the debug log goes last, so report its failure with printk */
+        rc = portals_debug_cleanup();
+        if (rc)
+                printk(KERN_ERR "portals_debug_cleanup: %d\n", rc);
+}
+
+EXPORT_SYMBOL(lib_dispatch);
+EXPORT_SYMBOL(PtlMEAttach);
+EXPORT_SYMBOL(PtlMEInsert);
+EXPORT_SYMBOL(PtlMEUnlink);
+EXPORT_SYMBOL(PtlEQAlloc);
+EXPORT_SYMBOL(PtlMDAttach);
+EXPORT_SYMBOL(PtlMDUnlink);
+EXPORT_SYMBOL(PtlNIInit);
+EXPORT_SYMBOL(PtlNIFini);
+EXPORT_SYMBOL(PtlNIDebug);
+EXPORT_SYMBOL(PtlInit);
+EXPORT_SYMBOL(PtlFini);
+EXPORT_SYMBOL(PtlPut);
+EXPORT_SYMBOL(PtlGet);
+EXPORT_SYMBOL(ptl_err_str);
+EXPORT_SYMBOL(portal_subsystem_debug);
+EXPORT_SYMBOL(portal_debug);
+EXPORT_SYMBOL(portal_stack);
+EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(PtlEQWait);
+EXPORT_SYMBOL(PtlEQFree);
+EXPORT_SYMBOL(PtlEQGet);
+EXPORT_SYMBOL(PtlGetId);
+EXPORT_SYMBOL(PtlMDBind);
+EXPORT_SYMBOL(lib_iov_nob);
+EXPORT_SYMBOL(lib_copy_iov2buf);
+EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_kiov_nob);
+EXPORT_SYMBOL(lib_copy_kiov2buf);
+EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_finalize);
+EXPORT_SYMBOL(lib_parse);
+EXPORT_SYMBOL(lib_init);
+EXPORT_SYMBOL(lib_fini);
+EXPORT_SYMBOL(portal_kmemory);
+EXPORT_SYMBOL(kportal_daemonize);
+EXPORT_SYMBOL(kportal_blockallsigs);
+EXPORT_SYMBOL(kportal_nal_register);
+EXPORT_SYMBOL(kportal_nal_unregister);
+EXPORT_SYMBOL(kportal_assertion_failed);
+EXPORT_SYMBOL(dispatch_name);
+EXPORT_SYMBOL(kportal_get_ni);
+EXPORT_SYMBOL(kportal_put_ni);
+
+module_init(init_kportals_module);
+module_exit (exit_kportals_module);
diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c
new file mode 100644 (file)
index 0000000..2fa739a
--- /dev/null
@@ -0,0 +1,290 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+#include <asm/div64.h>
+
+static struct ctl_table_header *portals_table_header = NULL;
+extern char debug_file_path[1024];
+extern char debug_daemon_file_path[1024];
+extern char portals_upcall[1024];
+
+#define PSDEV_PORTALS  (0x100)
+#define PSDEV_DEBUG           1   /* control debugging */
+#define PSDEV_SUBSYSTEM_DEBUG 2   /* control debugging */
+#define PSDEV_PRINTK          3   /* force all errors to console */
+#define PSDEV_DEBUG_PATH      4   /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 5   /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL  6   /* User mode upcall script  */
+
+/* number of real entries in portals_table; one extra slot holds the
+ * {0} terminator */
+#define PORTALS_PRIMARY_CTLCNT 6
+/*
+ * sysctl entries placed under the "portals" directory (see top_table):
+ * three integer tunables (debug, subsystem_debug, printk) and three
+ * string tunables (debug_path, debug_daemon_path, upcall), all mode 0644.
+ */
+static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
+        {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
+         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
+        {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
+         sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
+         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {0}
+};
+
+/* Root of the sysctl tree registered by insert_proc(): a single
+ * "portals" directory (mode 0555) whose children are portals_table. */
+static struct ctl_table top_table[2] = {
+        {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table},
+        {0}
+};
+
+
+#ifdef PORTALS_PROFILING
+/*
+ * profiling stuff.  we do this statically for now 'cause its simple,
+ * but we could do some tricks with elf sections to have this array
+ * automatically built.
+ */
+/* def_prof(FOO) initialises slot PROF__FOO with the name "FOO" and
+ * zeroes for the remaining fields */
+#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }
+
+/* One entry per profiled interval, indexed by its PROF__* constant.
+ * insert_proc() refuses to start if the entry count != MAX_PROFS. */
+struct prof_ent prof_ents[] = {
+        def_prof(our_recvmsg),
+        def_prof(our_sendmsg),
+        def_prof(socknal_recv),
+        def_prof(lib_parse),
+        def_prof(conn_list_walk),
+        def_prof(memcpy),
+        def_prof(lib_finalize),
+        def_prof(pingcli_time),
+        def_prof(gmnal_send),
+        def_prof(gmnal_recv),
+};
+
+EXPORT_SYMBOL(prof_ents);
+
+/*
+ * this function is as crazy as the proc filling api
+ * requires.
+ *
+ * buffer: page allocated for us to scribble in.  the
+ *  data returned to the user will be taken from here.
+ * *start: address of the pointer that will tell the 
+ *  caller where in buffer the data the user wants is.
+ * ppos: offset in the entire /proc file that the user
+ *  currently wants.
+ * wanted: the amount of data the user wants.
+ *
+ * while going, 'curpos' is the offset in the entire
+ * file where we currently are.  We only actually
+ * start filling buffer when we get to a place in
+ * the file that the user cares about.
+ *
+ * we take care to only sprintf when the user cares because
+ * we're holding a lock while we do this.
+ *
+ * we're smart and know that we generate fixed size lines.
+ * we only start writing to the buffer when the user cares.
+ * This is unpredictable because we don't snapshot the
+ * list between calls that are filling in a file from
+ * the list.  The list could change mid read and the
+ * output will look very weird indeed.  oh well.
+ */
+
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        /*
+         * Returns the number of bytes made available at *start (at most
+         * 'wanted'); *eof is set once every prof_ents[] entry has been
+         * walked.
+         */
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        /* 'format' expands to 15+1+12+2+12+1+12+1+12+1 characters and each
+         * line is then terminated by '\n'.  line_len must count that
+         * newline too: the padding code below ("-1 for \n") and the
+         * curpos bookkeeping both assume it does.  It used to be one
+         * short, so offsets drifted by a byte per line. */
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1) + 1;
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                /* only the part of the header at/after ppos is left to
+                 * copy; copying header_len bytes from header + ppos read
+                 * past the end of the string whenever ppos > 0 */
+                int diff = MIN(header_len - (int)ppos, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for ( i = 0; i < MAX_PROFS ; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len))  {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                /* average cycles per finished interval; avoid dividing
+                 * by zero when nothing has finished yet */
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else
+                {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+/*
+ * Create the module's /proc and sysctl entries.
+ *
+ * With PORTALS_PROFILING enabled this creates the basedir directory and
+ * a "cycles" file below it, served by prof_read_proc().  With
+ * CONFIG_SYSCTL the "portals" sysctl tree (top_table) is registered
+ * unless it already is.
+ *
+ * Returns 0 on success, -1 when the profiling table is out of sync with
+ * MAX_PROFS or the "cycles" entry cannot be created.
+ */
+int insert_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /* dir[] is an automatic array: start it out as an empty string,
+         * otherwise strcat() appends after whatever garbage the stack
+         * holds (remove_proc() already does this) */
+        dir[0] = '\0';
+
+        /*
+         * This is pretty lame.  assuming that failure just
+         * means that they already existed.
+         */
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+/*
+ * Undo insert_proc(): with PORTALS_PROFILING remove the "cycles" file and
+ * then the basedir directory; with CONFIG_SYSCTL unregister the sysctl
+ * tree if it is registered and forget its header.
+ */
+void remove_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char path[128];
+        int base_len;
+
+        path[0]='\0';
+        strcat(path, basedir);
+
+        /* remember where the directory name ends so the "/cycles" suffix
+         * can be chopped off again below */
+        base_len = strlen(path);
+
+        strcat(path, "/cycles");
+        remove_proc_entry(path,0);
+
+        path[base_len] = '\0';
+        remove_proc_entry(path,0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header != NULL)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
diff --git a/lustre/portals/packaging/.cvsignore b/lustre/portals/packaging/.cvsignore
new file mode 100644 (file)
index 0000000..fd1d56a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+config.status
+config.cache
+configure
+portals.spec
diff --git a/lustre/portals/packaging/Makefile.am b/lustre/portals/packaging/Makefile.am
new file mode 100644 (file)
index 0000000..126bc69
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = portals.spec
\ No newline at end of file
diff --git a/lustre/portals/packaging/portals.spec.in b/lustre/portals/packaging/portals.spec.in
new file mode 100644 (file)
index 0000000..e196b3f
--- /dev/null
@@ -0,0 +1,116 @@
+%define kversion @RELEASE@
+%define linuxdir @LINUX@
+%define version HEAD
+
+Summary: Sandia Portals Message Passing - utilities 
+Name: portals
+Version: %{version}
+Release: 0210101748uml
+Copyright: LGPL
+Group: Utilities/System
+BuildRoot: /var/tmp/portals-%{version}-root
+Source: http://sandiaportals.org/portals-%{version}.tar.gz
+
+%description
+Sandia Portals message passing package.  Contains kernel modules, libraries and utilities. 
+
+%package -n portals-modules
+Summary: Kernel modules and NAL's for portals
+Group: Development/Kernel
+
+%description -n portals-modules
+Portals kernel modules and NALs for Linux %{kversion}.
+
+%package -n portals-source
+Summary: Portals kernel source for rebuilding with other kernels
+Group: Development/Kernel
+
+%description -n portals-source
+Portals kernel source for rebuilding with other kernels
+
+%prep
+%setup -n portals-%{version}
+
+%build
+rm -rf $RPM_BUILD_ROOT
+
+# Create the pristine source directory.
+srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version}
+mkdir -p $srcdir
+find . -name CVS -prune -o -print | cpio -ap $srcdir
+
+# Set an explicit path to our Linux tree, if we can.
+conf_flag=
+linuxdir=%{linuxdir}
+test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+./configure $conf_flag
+make 
+
+%install
+make install prefix=$RPM_BUILD_ROOT
+
+%ifarch alpha
+# this hurts me
+  conf_flag=
+  linuxdir=%{linuxdir}
+  test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+  make clean
+  ./configure --enable-rtscts-myrinet $conf_flag
+  make
+  cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o
+  cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload
+%endif
+
+
+%files
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /usr/sbin/acceptor
+%attr(-, root, root) /usr/sbin/ptlctl
+%attr(-, root, root) /usr/sbin/debugctl
+%ifarch alpha
+%attr(-, root, root) /usr/sbin/mcpload
+%endif
+%attr(-, root, root) /lib/libmyrnal.a
+%attr(-, root, root) /lib/libptlapi.a
+%attr(-, root, root) /lib/libptlctl.a
+%attr(-, root, root) /lib/libprocbridge.a
+%attr(-, root, root) /lib/libptllib.a
+%attr(-, root, root) /lib/libtcpnal.a 
+%attr(-, root, root) /lib/libtcpnalutil.a
+%attr(-, root, root) /usr/include/portals/*.h
+%attr(-, root, root) /usr/include/portals/base/*.h
+%attr(-, root, root) /usr/include/linux/*.h
+
+%files -n portals-modules
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o
+%ifarch alpha
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o
+%endif
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o
+
+%files -n portals-source
+%attr(-, root, root) /usr/src/portals-%{version}
+
+%post
+if [ ! -e /dev/portals ]; then
+   mknod /dev/portals c 10 240
+fi
+depmod -ae || exit 0
+
+grep -q portals /etc/modules.conf || \
+       echo 'alias char-major-10-240 portals' >> /etc/modules.conf
+
+grep -q '/dev/portals' /etc/modules.conf || \
+       echo 'alias /dev/portals portals' >> /etc/modules.conf
+
+%postun
+depmod -ae || exit 0
+
+%clean
+#rm -rf $RPM_BUILD_ROOT
+
+# end of file
diff --git a/lustre/portals/portals/Makefile.am b/lustre/portals/portals/Makefile.am
new file mode 100644 (file)
index 0000000..9fb7f6f
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
+lib_LIBRARIES= libportals.a
+libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-md.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-not-impl.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk
new file mode 100644 (file)
index 0000000..5627ef7
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs    := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c
new file mode 100644 (file)
index 0000000..57427f6
--- /dev/null
@@ -0,0 +1,161 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * PtlMDUpdate is here so that it can access the per-eventq
+ * structures.
+ */
+
+#include <portals/api-support.h>
+
+/* Library-wide event queue setup hook, run from PtlInit().  No global
+ * state is needed for event queues, so this always succeeds. */
+int ptl_eq_init(void)
+{
+        return PTL_OK;
+}
+
+/* Library-wide event queue teardown hook, run from PtlFini().  There is
+ * nothing to release. */
+void ptl_eq_fini(void)
+{
+}
+
+/* Per-interface event queue setup hook, run from PtlNIInit() for a newly
+ * registered NAL.  The NAL is not touched; always succeeds. */
+int ptl_eq_ni_init(nal_t *nal)
+{
+        return PTL_OK;
+}
+
+/* Per-interface event queue teardown hook, run from PtlNIFini() when the
+ * last reference to a NAL goes away.  Nothing to release. */
+void ptl_eq_ni_fini(nal_t *nal)
+{
+}
+
+/*
+ * Non-blocking fetch of the next event from a user-level event queue.
+ *
+ * eventq  handle of the queue to read
+ * ev      filled in with a copy of the event when one is available
+ *
+ * Returns PTL_NOINIT if the library is not initialized, PTL_INV_EQ if the
+ * handle's interface index does not map to a NAL, PTL_EQ_EMPTY if the
+ * next slot holds no new event, PTL_EQ_DROPPED if an event was copied out
+ * but its sequence number is not the one expected, and PTL_OK otherwise.
+ * The queue is examined and advanced with the NAL's lock held.
+ */
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        /* the slot for the next expected sequence number */
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        /* NOTE(review): PTL_SEQ_GT is presumably a wrap-aware "greater
+         * than"; the slot still carries an older sequence number, so no
+         * new event has been written into it yet */
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+        
+        /* ensure event is delivered correctly despite possible
+           races with lib_finalize */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        /* resynchronize on the event just consumed, even after a drop */
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+/*
+ * Blocking fetch of the next event: polls PtlEQGet() until it returns
+ * anything other than PTL_EQ_EMPTY, giving the NAL a chance to yield
+ * between polls, and returns that final PtlEQGet() result.
+ */
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        for (;;) {
+                nal_t *nal;
+                /* PtlEQGet validates the handle for us */
+                int status = PtlEQGet(eventq_in, event_out);
+
+                if (status != PTL_EQ_EMPTY)
+                        return status;
+
+                nal = ptl_hndl2nal(&eventq_in);
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+}
+
+#ifndef __KERNEL__
+/* Jump target used to abandon PtlEQWait() when SIGALRM arrives.  This is
+ * a sigjmp_buf so that the signal mask in force before the handler ran is
+ * restored by the jump; a plain longjmp() out of the handler can leave
+ * SIGALRM blocked, which would silence every later timeout. */
+static sigjmp_buf eq_jumpbuf;
+
+/* SIGALRM handler: unwind back into PtlEQWait_timeout(). */
+static void eq_timeout(int signal)
+{
+        siglongjmp(eq_jumpbuf, -1);
+}
+
+/* Seconds still to run on an alarm that had 'pending' seconds left when
+ * we displaced it 'elapsed' seconds ago.  0 means there was no such
+ * alarm; an alarm that is already overdue is reported as 1 second so
+ * that it is still delivered. */
+static unsigned int eq_alarm_remaining(int pending, time_t elapsed)
+{
+        if (pending <= 0)
+                return 0;
+        if (elapsed <= 0)
+                return pending;
+        if (elapsed >= (time_t)pending)
+                return 1;
+        return pending - (int)elapsed;
+}
+
+/*
+ * PtlEQWait() bounded by a SIGALRM timer.
+ *
+ * eventq_in  queue to wait on
+ * event_out  receives the event
+ * timeout    limit in seconds
+ *
+ * Returns PTL_EQ_EMPTY if the timer fired first, otherwise whatever
+ * PtlEQWait() returned.  The caller's SIGALRM handler is reinstated and
+ * any alarm the caller had pending is re-armed with the time it has left.
+ * Uses static state and the process-wide alarm, so it is not reentrant.
+ */
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        /* static: written after sigsetjmp() and read after siglongjmp(),
+         * where automatic variables would be indeterminate */
+        static void (*prev) (int);
+        static int left_over;
+        static time_t time_at_start;
+        int rc;
+
+        if (sigsetjmp(eq_jumpbuf, 1)) {
+                /* the timer went off while we were waiting */
+                signal(SIGALRM, prev);
+                alarm(eq_alarm_remaining(left_over,
+                                         time(NULL) - time_at_start));
+                return PTL_EQ_EMPTY;
+        }
+
+        time_at_start = time(NULL);
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        /* An alarm the caller already had pending wins if it is due
+         * sooner.  left_over == 0 means none was pending: re-arming with
+         * 0 would cancel our own timeout and the wait would never end. */
+        if (left_over != 0 && left_over < timeout)
+                alarm(left_over);
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        /* disarm first so our handler cannot fire after a successful wait */
+        alarm(0);
+        signal(SIGALRM, prev);
+        alarm(eq_alarm_remaining(left_over, time(NULL) - time_at_start));
+
+        return rc;
+}
+
+#endif
+
diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c
new file mode 100644 (file)
index 0000000..5cb0980
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Printable names of the Portals return codes, one string per code.
+ * NOTE(review): presumably indexed directly by the numeric PTL_* value,
+ * which is why the order here has to track portals/errno.h -- confirm.
+ *
+ * If you change these, you must update the number table in portals/errno.h */
+const char *ptl_err_str[] = {
+        "PTL_OK",
+        "PTL_SEGV",
+
+        "PTL_NOSPACE",
+        "PTL_INUSE",
+        "PTL_VAL_FAILED",
+
+        "PTL_NAL_FAILED",
+        "PTL_NOINIT",
+        "PTL_INIT_DUP",
+        "PTL_INIT_INV",
+        "PTL_AC_INV_INDEX",
+
+        "PTL_INV_ASIZE",
+        "PTL_INV_HANDLE",
+        "PTL_INV_MD",
+        "PTL_INV_ME",
+        "PTL_INV_NI",
+/* If you change these, you must update the number table in portals/errno.h */
+        "PTL_ILL_MD",
+        "PTL_INV_PROC",
+        "PTL_INV_PSIZE",
+        "PTL_INV_PTINDEX",
+        "PTL_INV_REG",
+
+        "PTL_INV_SR_INDX",
+        "PTL_ML_TOOLONG",
+        "PTL_ADDR_UNKNOWN",
+        "PTL_INV_EQ",
+        "PTL_EQ_DROPPED",
+
+        "PTL_EQ_EMPTY",
+        "PTL_NOUPDATE",
+        "PTL_FAIL",
+        "PTL_NOT_IMPLEMENTED",
+        "PTL_NO_ACK",
+
+        "PTL_IOV_TOO_MANY",
+        "PTL_IOV_TOO_SMALL",
+
+        "PTL_EQ_INUSE",
+        "PTL_MD_INUSE"
+};
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c
new file mode 100644 (file)
index 0000000..b54f684
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-init.c
+ * Initialization and global data for the p30 user side library
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * All handles have their interface number stored in the second 16 bit word
+ */
+
+#include <portals/api-support.h>
+
+/* Set to 1 by PtlInit() and back to 0 by PtlFini(); the API entry points
+ * test it and return PTL_NOINIT while it is 0. */
+int ptl_init;
+/* Debug settings.  NOTE(review): presumably read by the CDEBUG/CERROR
+ * logging macros to filter by subsystem and message class -- confirm
+ * against the header that defines them. */
+unsigned int portal_subsystem_debug = 0xfff7e3ff;
+unsigned int portal_debug = ~0;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+#ifdef __KERNEL__
+/* NOTE(review): looks like a running count of memory allocated through
+ * the portals allocation macros -- confirm. */
+atomic_t portal_kmemory = ATOMIC_INIT(0);
+#endif
+
+/* Set to 1 by PtlInit(); nothing in this file clears it again. */
+int __p30_initialized;
+/* The remaining globals are not referenced in this file.
+ * NOTE(review): presumably per-transport (Myrinet / IP) state used by
+ * code elsewhere -- confirm. */
+int __p30_myr_initialized;
+int __p30_ip_initialized;
+ptl_handle_ni_t __myr_ni_handle;
+ptl_handle_ni_t __ip_ni_handle;
+
+int __p30_myr_timeout = 10;
+int __p30_ip_timeout;
+
+/*
+ * Initialize the user-side Portals library: interface table first, then
+ * the match-entry and event-queue layers.  Calling it again once
+ * initialized is a no-op.  Always returns PTL_OK.
+ */
+int PtlInit(void)
+{
+        if (!ptl_init) {
+                ptl_ni_init();
+                ptl_me_init();
+                ptl_eq_init();
+
+                ptl_init = 1;
+                __p30_initialized = 1;
+        }
+
+        return PTL_OK;
+}
+
+
+/*
+ * Shut the library down.  The layers are torn down in the opposite order
+ * to PtlInit() (event queues, match entries, interfaces) and ptl_init is
+ * cleared so that later API calls report PTL_NOINIT.
+ */
+void PtlFini(void)
+{
+        ptl_eq_fini();
+        ptl_me_fini();
+        ptl_ni_fini();
+
+        ptl_init = 0;
+}
diff --git a/lustre/portals/portals/api-md.c b/lustre/portals/portals/api-md.c
new file mode 100644 (file)
index 0000000..967112f
--- /dev/null
@@ -0,0 +1,9 @@
+/*
+ * api-p30/md.c
+ *
+ * Memory descriptor functions that need address validation
+ * There are a few standing issues...
+ *  - Addresses are invalidated by the library without telling us.
+ */
+#include <portals/api-support.h>
+
diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c
new file mode 100644 (file)
index 0000000..573e948
--- /dev/null
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-me.c
+ * Match Entry local operations.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Library-wide match-entry setup hook, run from PtlInit().  Nothing to
+ * prepare; always succeeds. */
+int ptl_me_init(void)
+{
+        return PTL_OK;
+}
+/* Library-wide match-entry teardown hook, run from PtlFini().  Nothing
+ * to release. */
+void ptl_me_fini(void)
+{
+}
+/* Per-interface match-entry setup hook, run from PtlNIInit() for a newly
+ * registered NAL.  The NAL is not touched; always succeeds. */
+int ptl_me_ni_init(nal_t *nal)
+{
+        return PTL_OK;
+}
+
+/* Per-interface match-entry teardown hook, run from PtlNIFini() when the
+ * last reference to a NAL goes away.  Nothing to release. */
+void ptl_me_ni_fini(nal_t *nal)
+{
+}
diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c
new file mode 100644 (file)
index 0000000..952da4f
--- /dev/null
@@ -0,0 +1,184 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-ni.c
+ * Network Interface code
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Capacity of the interface table below. */
+#define MAX_NIS 8
+/* Registered NALs, indexed by the nal_idx field of a handle; an unused
+ * entry is NULL. */
+static nal_t *ptl_interfaces[MAX_NIS];
+/* Incremented by PtlNIInit() when it registers a new NAL and decremented
+ * by PtlNIFini() when the last reference to one is dropped. */
+int ptl_num_interfaces = 0;
+
+/*
+ * Map a handle to the NAL registered under its nal_idx.
+ *
+ * Returns NULL when the index is outside the interface table; an
+ * in-range index whose slot is unused also yields NULL, since that is
+ * what the slot contains.
+ *
+ * XXX no locking here: the caller must not race with interface
+ * setup/teardown, or the handle could be invalidated (or, worse, end up
+ * naming a completely different interface) underneath her.
+ */
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int slot = handle->nal_idx;
+
+        return (slot < MAX_NIS) ? ptl_interfaces[slot] : NULL;
+}
+
+/*
+ * Reset the interface table to "no interfaces registered".  Run from
+ * PtlInit().  Always returns PTL_OK.
+ */
+int ptl_ni_init(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        /* Keep the count in step with the table just emptied.  Without
+         * this, a PtlFini()/PtlInit() cycle would leave the count at its
+         * old value and PtlNIInit() would skip that many usable slots. */
+        ptl_num_interfaces = 0;
+
+        return PTL_OK;
+}
+
+/*
+ * Shut down every interface still present in the table, passing each
+ * NAL its table index.  NALs without a shutdown method are skipped.
+ * Run from PtlFini().  The table entries themselves are left in place.
+ */
+void ptl_ni_fini(void)
+{
+        int slot;
+
+        for (slot = 0; slot < MAX_NIS; slot++) {
+                nal_t *nal = ptl_interfaces[slot];
+
+                if (nal != NULL && nal->shutdown)
+                        nal->shutdown(nal, slot);
+        }
+}
+
+/* Serialization of PtlNIInit()/PtlNIFini().  Kernel builds take a real
+ * mutex; in other builds both helpers are empty. */
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_ni_init_mutex);
+#endif
+
+/* Enter the interface setup/teardown critical section. */
+static void ptl_ni_init_mutex_enter (void)
+{
+#ifdef __KERNEL__
+        down (&ptl_ni_init_mutex);
+#endif
+}
+
+/* Leave the interface setup/teardown critical section. */
+static void ptl_ni_init_mutex_exit (void)
+{
+#ifdef __KERNEL__
+        up (&ptl_ni_init_mutex);
+#endif
+}
+
+
+/*
+ * Bring up a network interface, or take another reference on one that
+ * is already registered.
+ *
+ * interface      NAL start-up function; called with the table index the
+ *                new interface would occupy plus the three size/pid
+ *                arguments, and returns the NAL (NULL on failure)
+ * ptl_size       passed through to 'interface'
+ * acl_size       passed through to 'interface'
+ * requested_pid  passed through to 'interface'
+ * handle         nal_idx is set to the interface's table index
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_NAL_FAILED if the start-up
+ * function returned NULL, PTL_NOSPACE if the table is full (the new NAL
+ * is shut down again), PTL_OK otherwise.
+ */
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int slot;
+        int i;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        /* Pick the first unused table entry.  PtlNIFini() can free any
+         * entry, so the interface count does not identify a free one:
+         * using it as an index could overwrite a live interface.  slot
+         * ends up as MAX_NIS when the table is full. */
+        for (slot = 0; slot < MAX_NIS; slot++) {
+                if (ptl_interfaces[slot] == NULL)
+                        break;
+        }
+
+        nal = interface(slot, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        /* The start-up function may hand back a NAL we already know;
+         * search the whole table, since live entries need not be packed
+         * at the front. */
+        for (i = 0; i < MAX_NIS; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        handle->nal_idx = slot;
+        if (slot >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, slot);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        ptl_interfaces[slot] = nal;
+        ptl_num_interfaces++;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+/*
+ * Drop one reference on a network interface.  When the last reference
+ * goes, the per-interface ME and EQ hooks run, the NAL is shut down (if
+ * it has a shutdown method) and its table entry is cleared.
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if the handle does
+ * not name a registered NAL, the NAL's shutdown result when it was shut
+ * down, and PTL_OK in every other case.
+ */
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int rc = PTL_OK;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                rc = PTL_INV_HANDLE;
+                goto out;
+        }
+
+        /* other users remain: nothing more to do */
+        if (--nal->refct > 0)
+                goto out;
+
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, ni.nal_idx);
+
+        ptl_interfaces[ni.nal_idx] = NULL;
+        ptl_num_interfaces--;
+
+out:
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+/*
+ * Produce the interface handle that goes with an arbitrary handle.  The
+ * input handle is copied to *ni_out unchanged.  Always returns PTL_OK.
+ */
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out)
+{
+        *ni_out = handle_in;
+        return PTL_OK;
+}
diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c
new file mode 100644 (file)
index 0000000..cbd4d1f
--- /dev/null
@@ -0,0 +1,601 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Assumes the handle encodes the network number in the second 16 bit word
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/api-support.h>
+
+/*
+ * Hand an API call to the NAL that owns a handle.
+ *
+ * any_h    handle whose nal_idx selects the NAL
+ * cmd      PTL_* command number of the call
+ * argbuf   argument block, argsize bytes
+ * retbuf   result block to be filled in, retsize bytes
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if the handle does
+ * not name a registered NAL, PTL_OK once the call has been forwarded.
+ * The outcome of the call itself is reported in the result block, not
+ * here.
+ */
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                /* every wrapper comes through here, so name the command
+                 * rather than blaming PtlGetId for all of them */
+                fprintf(stderr, "Portals API call (cmd %d): Not initialized\n",
+                        cmd);
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
+
+/*
+ * Fetch the process id associated with an interface.
+ *
+ * ni_handle  interface to query
+ * id         receives the id; may be NULL
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in request;
+        PtlGetId_out reply;
+        int status;
+
+        request.handle_in = ni_handle;
+
+        status = do_forward(ni_handle, PTL_GETID, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (id != NULL)
+                *id = reply.id_out;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_FAILNID request carrying the given nid and threshold to
+ * the interface's NAL.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold)
+{
+        PtlFailNid_in  request;
+        PtlFailNid_out reply;
+        int            status;
+
+        request.interface = interface;
+        request.nid       = nid;
+        request.threshold = threshold;
+
+        status = do_forward (interface, PTL_FAILNID,
+                             &request, sizeof(request), &reply, sizeof (reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Read one status register of an interface.
+ *
+ * interface_in  interface to query
+ * register_in   register to read
+ * status_out    receives the value; may be NULL
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in request;
+        PtlNIStatus_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.register_in = register_in;
+
+        status = do_forward(interface_in, PTL_NISTATUS, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (status_out != NULL)
+                *status_out = reply.status_out;
+
+        return reply.rc;
+}
+
+/*
+ * Ask an interface for the distance to a process.
+ *
+ * interface_in  interface to query
+ * process_in    process of interest
+ * distance_out  receives the distance; may be NULL
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in request;
+        PtlNIDist_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.process_in = process_in;
+
+        status = do_forward(interface_in, PTL_NIDIST, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (distance_out != NULL)
+                *distance_out = reply.distance_out;
+
+        return reply.rc;
+}
+
+
+
+/*
+ * Forward a PTL_NIDEBUG request carrying mask_in to the interface's NAL.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the rc field of the library's reply.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in request;
+        PtlNIDebug_out reply;
+        int status;
+
+        request.mask_in = mask_in;
+
+        status = do_forward(ni, PTL_NIDEBUG, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_OK)
+                return reply.rc;
+
+        return status;
+}
+
+/*
+ * Forward a PTL_MEATTACH request: create a match entry on a portal
+ * table index of the given interface.
+ *
+ * handle_out  receives the new ME handle (interface index from
+ *             interface_in, cookie from the library); may be NULL
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in request;
+        PtlMEAttach_out reply;
+        int status;
+
+        request.interface_in = interface_in;
+        request.index_in = index_in;
+        request.match_id_in = match_id_in;
+        request.match_bits_in = match_bits_in;
+        request.ignore_bits_in = ignore_bits_in;
+        request.unlink_in = unlink_in;
+        request.position_in = pos_in;
+
+        status = do_forward(interface_in, PTL_MEATTACH, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->cookie = reply.handle_out.cookie;
+                handle_out->nal_idx = interface_in.nal_idx;
+        }
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MEINSERT request: create a match entry positioned
+ * relative to an existing one.
+ *
+ * handle_out  receives the new ME handle (interface index from
+ *             current_in, cookie from the library); may be NULL
+ *
+ * Returns PTL_INV_ME if current_in does not name a registered NAL, any
+ * other do_forward() error unchanged, otherwise the result code reported
+ * by the library.
+ */
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out)
+{
+        PtlMEInsert_in request;
+        PtlMEInsert_out reply;
+        int status;
+
+        request.current_in = current_in;
+        request.match_id_in = match_id_in;
+        request.match_bits_in = match_bits_in;
+        request.ignore_bits_in = ignore_bits_in;
+        request.unlink_in = unlink_in;
+        request.position_in = position_in;
+
+        status = do_forward(current_in, PTL_MEINSERT, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->cookie = reply.handle_out.cookie;
+                handle_out->nal_idx = current_in.nal_idx;
+        }
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MEUNLINK request for a match entry.  The request's
+ * unlink_in field is always sent as PTL_RETAIN.
+ *
+ * Returns PTL_INV_ME if current_in does not name a registered NAL, any
+ * other do_forward() error unchanged, otherwise the result code reported
+ * by the library.
+ */
+int PtlMEUnlink(ptl_handle_me_t current_in)
+{
+        PtlMEUnlink_in request;
+        PtlMEUnlink_out reply;
+        int status;
+
+        request.current_in = current_in;
+        request.unlink_in = PTL_RETAIN;
+
+        status = do_forward(current_in, PTL_MEUNLINK, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_TBLDUMP request for the given index to the interface's
+ * NAL.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+{
+        PtlTblDump_in request;
+        PtlTblDump_out reply;
+        int status;
+
+        request.index_in = index_in;
+
+        status = do_forward(ni, PTL_TBLDUMP, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_OK)
+                return reply.rc;
+
+        return status;
+}
+
+/*
+ * Forward a PTL_MEDUMP request for a match entry.
+ *
+ * Returns PTL_INV_ME if current_in does not name a registered NAL, any
+ * other do_forward() error unchanged, otherwise the result code reported
+ * by the library.
+ */
+int PtlMEDump(ptl_handle_me_t current_in)
+{
+        PtlMEDump_in request;
+        PtlMEDump_out reply;
+        int status;
+
+        request.current_in = current_in;
+
+        status = do_forward(current_in, PTL_MEDUMP, &request,
+                            sizeof(request), &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_ME;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Have the NAL vet the memory described by an MD before it is handed to
+ * the library.
+ *
+ * current_in  handle whose nal_idx selects the NAL
+ * md_in       descriptor to check: either one contiguous region, or,
+ *             with PTL_MD_IOV set, an array of md_in.niov iovecs
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if the handle does
+ * not name a registered NAL, PTL_SEGV as soon as the NAL rejects a
+ * region, and 0 otherwise (including when the NAL has no validate
+ * method).
+ */
+static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+{
+        struct iovec *frag;
+        nal_t *nal;
+        int i;
+
+        if (!ptl_init) {
+                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&current_in);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        /* a NAL without a validate method accepts everything */
+        if (nal->validate == NULL)
+                return 0;
+
+        if ((md_in.options & PTL_MD_IOV) == 0) {
+                /* contiguous buffer */
+                if (nal->validate (nal, md_in.start, md_in.length))
+                        return (PTL_SEGV);
+                return 0;
+        }
+
+        /* scatter/gather list: every fragment has to pass */
+        frag = (struct iovec *)md_in.start;
+        for (i = 0; i < md_in.niov; i++, frag++) {
+                if (nal->validate (nal, frag->iov_base, frag->iov_len))
+                        return (PTL_SEGV);
+        }
+
+        return 0;
+}
+
+/*
+ * Translate the user-level EQ handle stored in an MD into the handle
+ * the library knows the queue by (the cb_eq_handle stashed in the
+ * user-side queue structure).  PTL_EQ_NONE passes through as
+ * PTL_EQ_NONE.
+ */
+static ptl_handle_eq_t md2eq (ptl_md_t *md)
+{
+        if (!PtlHandleEqual (md->eventq, PTL_EQ_NONE))
+                return (ptl_handle2usereq (&md->eventq)->cb_eq_handle);
+
+        return (PTL_EQ_NONE);
+}
+
+
+/*
+ * Validate a memory descriptor and forward a PTL_MDATTACH request that
+ * attaches it to a match entry.
+ *
+ * handle_out  receives the new MD handle (interface index from me_in,
+ *             cookie from the library); may be NULL
+ *
+ * Returns PTL_INV_ME if me_in does not name a registered NAL, any other
+ * validation or forwarding error unchanged, otherwise the result code
+ * reported by the library.
+ */
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+{
+        PtlMDAttach_in request;
+        PtlMDAttach_out reply;
+        int status;
+
+        status = validate_md(me_in, md_in);
+        if (status != PTL_OK)
+                return (status == PTL_INV_HANDLE) ? PTL_INV_ME : status;
+
+        request.eq_in = md2eq(&md_in);
+        request.me_in = me_in;
+        request.md_in = md_in;
+        request.unlink_in = unlink_in;
+
+        status = do_forward(me_in, PTL_MDATTACH,
+                            &request, sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return (status == PTL_INV_HANDLE) ? PTL_INV_ME : status;
+
+        if (handle_out != NULL) {
+                handle_out->cookie = reply.handle_out.cookie;
+                handle_out->nal_idx = me_in.nal_idx;
+        }
+
+        return reply.rc;
+}
+
+
+
+/*
+ * Validate a memory descriptor and forward a PTL_MDBIND request that
+ * binds it to an interface.
+ *
+ * handle_out  receives the new MD handle (interface index from ni_in,
+ *             cookie from the library); may be NULL
+ *
+ * Returns the validation or forwarding error if either step fails,
+ * otherwise the result code reported by the library.
+ */
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+                       ptl_handle_md_t * handle_out)
+{
+        PtlMDBind_in request;
+        PtlMDBind_out reply;
+        int status;
+
+        status = validate_md(ni_in, md_in);
+        if (status != PTL_OK)
+                return status;
+
+        request.eq_in = md2eq(&md_in);
+        request.ni_in = ni_in;
+        request.md_in = md_in;
+
+        status = do_forward(ni_in, PTL_MDBIND,
+                            &request, sizeof(request), &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        if (handle_out != NULL) {
+                handle_out->cookie = reply.handle_out.cookie;
+                handle_out->nal_idx = ni_in.nal_idx;
+        }
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MDUPDATE request for a memory descriptor.
+ *
+ * md_in      descriptor to update
+ * old_inout  if non-NULL, sent with the request and overwritten with
+ *            the descriptor the library returns
+ * new_inout  if non-NULL, validated and sent as the replacement
+ * testq_in   queue to test against, or PTL_EQ_NONE; for a real queue its
+ *            library-side handle and current user-side sequence number
+ *            are sent along
+ *
+ * Returns PTL_INV_MD if md_in does not name a registered NAL, any other
+ * validation or forwarding error unchanged, otherwise the result code
+ * reported by the library.
+ */
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+{
+        PtlMDUpdate_internal_in request;
+        PtlMDUpdate_internal_out reply;
+        int status;
+
+        request.md_in = md_in;
+
+        request.old_inout_valid = 0;
+        if (old_inout != NULL) {
+                request.old_inout = *old_inout;
+                request.old_inout_valid = 1;
+        }
+
+        request.new_inout_valid = 0;
+        if (new_inout != NULL) {
+                status = validate_md (md_in, *new_inout);
+                if (status == PTL_INV_HANDLE)
+                        return PTL_INV_MD;
+                if (status != PTL_OK)
+                        return status;
+                request.new_inout = *new_inout;
+                request.new_inout_valid = 1;
+        }
+
+        if (!PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
+                ptl_eq_t *testq = ptl_handle2usereq (&testq_in);
+
+                request.testq_in = testq->cb_eq_handle;
+                request.sequence_in = testq->sequence;
+        } else {
+                request.testq_in = PTL_EQ_NONE;
+                request.sequence_in = -1;
+        }
+
+        status = do_forward(md_in, PTL_MDUPDATE, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_MD;
+        if (status != PTL_OK)
+                return status;
+
+        if (old_inout != NULL)
+                *old_inout = reply.old_inout;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_MDUNLINK request for a memory descriptor.
+ *
+ * Returns PTL_INV_MD if md_in does not name a registered NAL, any other
+ * do_forward() error unchanged, otherwise the result code reported by
+ * the library.
+ */
+int PtlMDUnlink(ptl_handle_md_t md_in)
+{
+        PtlMDUnlink_in request;
+        PtlMDUnlink_out reply;
+        int status;
+
+        request.md_in = md_in;
+
+        status = do_forward(md_in, PTL_MDUNLINK, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status == PTL_INV_HANDLE)
+                return PTL_INV_MD;
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Create an event queue on an interface.
+ *
+ * interface   interface the queue belongs to
+ * count       requested number of event slots; rounded up to a power of
+ *             two
+ * callback    passed through to the library in the request
+ * handle_out  receives the user-level EQ handle
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_INV_HANDLE if 'interface'
+ * does not name a registered NAL, PTL_VAL_FAILED if count is 0 or
+ * overflows when rounded up, PTL_NOSPACE if memory cannot be allocated,
+ * the NAL's validate result or the forwarding/library error if one of
+ * those steps fails, and PTL_OK on success.
+ */
+int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out)
+{
+        ptl_eq_t *eq = NULL;
+        ptl_event_t *ev = NULL;
+        PtlEQAlloc_in args;
+        PtlEQAlloc_out ret;
+        int rc, i;
+        nal_t *nal;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        nal = ptl_hndl2nal (&interface);
+        if (nal == NULL)
+                return PTL_INV_HANDLE;
+
+        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
+                do {                    /* knock off all but the top bit... */
+                        count &= ~LOWEST_BIT_SET (count);
+                } while (count != LOWEST_BIT_SET(count));
+
+                count <<= 1;                             /* ...and round up */
+        }
+
+        if (count == 0)        /* catch bad parameter / overflow on roundup */
+                return (PTL_VAL_FAILED);
+
+        PORTAL_ALLOC(ev, count * sizeof(ptl_event_t));
+        if (!ev)
+                return PTL_NOSPACE;
+
+        /* Get the user-side queue structure before involving the library.
+         * If this allocation failed only after PTL_EQALLOC had succeeded,
+         * the library would be left holding a queue nobody can free,
+         * pointing at an event buffer we are about to release. */
+        PORTAL_ALLOC(eq, sizeof(*eq));
+        if (!eq) {
+                rc = PTL_NOSPACE;
+                goto fail_ev;
+        }
+
+        /* sequence 0 marks a slot that has never held an event; the
+         * queue itself starts at sequence 1 */
+        for (i = 0; i < count; i++)
+                ev[i].sequence = 0;
+
+        if (nal->validate != NULL) {
+                rc = nal->validate(nal, ev, count * sizeof(ptl_event_t));
+                if (rc != PTL_OK)
+                        goto fail;
+        }
+
+        args.ni_in = interface;
+        args.count_in = count;
+        args.base_in = ev;
+        args.len_in = count * sizeof(*ev);
+        args.callback_in = callback;
+
+        rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                goto fail;
+        if (ret.rc)
+                GOTO(fail, rc = ret.rc);
+
+        eq->sequence = 1;
+        eq->size = count;
+        eq->base = ev;
+
+        /* EQ handles are a little weird.  PtlEQGet() just looks at the
+         * queued events in shared memory.  It doesn't want to do_forward()
+         * at all, so the cookie in the EQ handle we pass out of here is
+         * simply a pointer to the event queue we just set up.  We stash
+         * the handle returned by do_forward(), so we can pass it back via
+         * do_forward() when we need to. */
+
+        eq->cb_eq_handle.nal_idx = interface.nal_idx;
+        eq->cb_eq_handle.cookie = ret.handle_out.cookie;
+
+        handle_out->nal_idx = interface.nal_idx;
+        handle_out->cookie = (__u64)((unsigned long)eq);
+        return PTL_OK;
+
+fail:
+        PORTAL_FREE(eq, sizeof(*eq));
+fail_ev:
+        PORTAL_FREE(ev, count * sizeof(ptl_event_t));
+        return rc;
+}
+
+/*
+ * Destroy an event queue created by PtlEQAlloc().
+ *
+ * The library is asked to free its side first.  The user-side event
+ * buffer and queue structure are released only once that has succeeded:
+ * if the library refuses it still refers to the event buffer, and
+ * freeing the buffer underneath it would let it write into freed
+ * memory.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * the library's result code if it rejected the request, and PTL_OK once
+ * the queue has been freed.  On any failure the queue is left intact.
+ */
+int PtlEQFree(ptl_handle_eq_t eventq)
+{
+        PtlEQFree_in args;
+        PtlEQFree_out ret;
+        ptl_eq_t *eq;
+        int rc;
+
+        /* the cookie of a user-level EQ handle is the ptl_eq_t itself */
+        eq = ptl_handle2usereq (&eventq);
+        args.eventq_in = eq->cb_eq_handle;
+
+        rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args,
+                        sizeof(args), &ret, sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;
+        if (ret.rc != PTL_OK)
+                return ret.rc;
+
+        PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t));
+        PORTAL_FREE(eq, sizeof(*eq));
+
+        return PTL_OK;
+}
+
+/*
+ * Forward a PTL_ACENTRY request carrying the given access-control
+ * index, match id and portal index to the interface's NAL.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
+{
+        PtlACEntry_in request;
+        PtlACEntry_out reply;
+        int status;
+
+        /* pack the arguments into the block handed to the forwarder */
+        request.ni_in = ni_in;
+        request.index_in = index_in;
+        request.match_id_in = match_id_in;
+        request.portal_in = portal_in;
+
+        status = do_forward(ni_in, PTL_ACENTRY, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_PUT request for the given memory descriptor and target.
+ * All arguments are copied into the request block unchanged.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
+{
+        PtlPut_in request;
+        PtlPut_out reply;
+        int status;
+
+        /* pack the arguments into the block handed to the forwarder */
+        request.md_in = md_in;
+        request.ack_req_in = ack_req_in;
+        request.target_in = target_in;
+        request.portal_in = portal_in;
+        request.cookie_in = cookie_in;
+        request.match_bits_in = match_bits_in;
+        request.offset_in = offset_in;
+        request.hdr_data_in = hdr_data_in;
+
+        status = do_forward(md_in, PTL_PUT, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
+
+/*
+ * Forward a PTL_GET request for the given memory descriptor and target.
+ * All arguments are copied into the request block unchanged.
+ *
+ * Returns the do_forward() error if the call could not be forwarded,
+ * otherwise the result code reported by the library.
+ */
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
+{
+        PtlGet_in request;
+        PtlGet_out reply;
+        int status;
+
+        /* pack the arguments into the block handed to the forwarder */
+        request.md_in = md_in;
+        request.target_in = target_in;
+        request.portal_in = portal_in;
+        request.cookie_in = cookie_in;
+        request.match_bits_in = match_bits_in;
+        request.offset_in = offset_in;
+
+        status = do_forward(md_in, PTL_GET, &request, sizeof(request),
+                            &reply, sizeof(reply));
+        if (status != PTL_OK)
+                return status;
+
+        return reply.rc;
+}
diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c
new file mode 100644 (file)
index 0000000..63ed70f
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-dispatch.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/lib-dispatch.h>
+
+/* One row of the API dispatch table: a handler plus its printable name. */
+typedef struct {
+        /* handler called by lib_dispatch() with the NAL, the opaque
+         * 'private' pointer and the argument and return blocks */
+        int (*fun) (nal_cb_t * nal, void *private, void *in, void *out);
+        /* name printed in debug output and returned by dispatch_name() */
+        char *name;
+} dispatch_table_t;
+
+/*
+ * API call number -> handler table, indexed by the PTL_* call codes.
+ * Slots that are not named here are zero-filled, so their 'fun' is a
+ * null pointer and lib_dispatch() rejects them.
+ *
+ * Uses standard C99 designated initialisers ("[idx] = {...}") rather
+ * than the obsolete GNU form without '='.
+ */
+static dispatch_table_t dispatch_table[] = {
+        [PTL_GETID] = {do_PtlGetId, "PtlGetId"},
+        [PTL_NISTATUS] = {do_PtlNIStatus, "PtlNIStatus"},
+        [PTL_NIDIST] = {do_PtlNIDist, "PtlNIDist"},
+        [PTL_NIDEBUG] = {do_PtlNIDebug, "PtlNIDebug"},
+        [PTL_MEATTACH] = {do_PtlMEAttach, "PtlMEAttach"},
+        [PTL_MEINSERT] = {do_PtlMEInsert, "PtlMEInsert"},
+        [PTL_MEUNLINK] = {do_PtlMEUnlink, "PtlMEUnlink"},
+        [PTL_TBLDUMP] = {do_PtlTblDump, "PtlTblDump"},
+        [PTL_MEDUMP] = {do_PtlMEDump, "PtlMEDump"},
+        [PTL_MDATTACH] = {do_PtlMDAttach, "PtlMDAttach"},
+        [PTL_MDBIND] = {do_PtlMDBind, "PtlMDBind"},
+        [PTL_MDUPDATE] = {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"},
+        [PTL_MDUNLINK] = {do_PtlMDUnlink, "PtlMDUnlink"},
+        [PTL_EQALLOC] = {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"},
+        [PTL_EQFREE] = {do_PtlEQFree_internal, "PtlEQFree_internal"},
+        [PTL_ACENTRY] = {do_PtlACEntry, "PtlACEntry"},
+        [PTL_PUT] = {do_PtlPut, "PtlPut"},
+        [PTL_GET] = {do_PtlGet, "PtlGet"},
+        [PTL_FAILNID] = {do_PtlFailNid, "PtlFailNid"},
+        /* terminator: occupies the slot after the last designated entry */
+        /*    */ {0, ""}
+};
+
+/*
+ * This really should be elsewhere, but lib-p30/dispatch.c is
+ * an automatically generated file.
+ *
+ * Route API call number 'index' to its handler in dispatch_table,
+ * passing through the NAL, the opaque 'private' pointer and the
+ * argument and return blocks.
+ *
+ * An index that is negative, above LIB_MAX_DISPATCH, beyond the end of
+ * dispatch_table, or whose slot has no handler is logged and ignored;
+ * ret_block is not written in that case.
+ */
+void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block,
+                  void *ret_block)
+{
+        /* real number of slots; guards the lookup even if
+         * LIB_MAX_DISPATCH and the table ever disagree */
+        const int nentries = sizeof(dispatch_table) / sizeof(dispatch_table[0]);
+        lib_ni_t *ni = &nal->ni;
+
+        if (index < 0 || index > LIB_MAX_DISPATCH || index >= nentries ||
+            !dispatch_table[index].fun) {
+                CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index);
+                return;
+        }
+
+        CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid,
+               dispatch_table[index].name, index);
+
+        dispatch_table[index].fun(nal, private, arg_block, ret_block);
+}
+
+/*
+ * Return the printable name stored in dispatch_table for API call
+ * number 'index'.  An index outside the table yields the empty string
+ * instead of reading past the array; in-range slots are returned
+ * exactly as stored.
+ */
+char *dispatch_name(int index)
+{
+        const int nentries = sizeof(dispatch_table) / sizeof(dispatch_table[0]);
+
+        if (index < 0 || index >= nentries)
+                return "";
+
+        return dispatch_table[index].name;
+}
diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c
new file mode 100644 (file)
index 0000000..4c6c292
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-eq.c
+ * Library level Event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Library-side handler for event queue allocation.
+ *
+ * Reads from the PtlEQAlloc_in block:
+ *      count_in     number of event slots (must have a single bit set)
+ *      base_in      start of the event array
+ *      callback_in  stored as the queue's event callback
+ *
+ * Writes to the PtlEQAlloc_out block:
+ *      rc           PTL_OK, PTL_VAL_FAILED, PTL_NOSPACE or the error
+ *                   returned by the NAL's cb_map
+ *      handle_out   handle of the new queue (on success only)
+ *
+ * Returns the same value it stores in rc.
+ */
+int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args,
+                           void *v_ret)
+{
+        PtlEQAlloc_in  *in = v_args;
+        PtlEQAlloc_out *out = v_ret;
+        lib_eq_t       *new_eq;
+        unsigned long   flags;
+
+        /* api should have rounded up */
+        if (LOWEST_BIT_SET (in->count_in) != in->count_in) {
+                out->rc = PTL_VAL_FAILED;
+                return out->rc;
+        }
+
+        /* the descriptor is obtained before the state lock is taken */
+        new_eq = lib_eq_alloc (nal);
+        if (new_eq == NULL) {
+                out->rc = PTL_NOSPACE;
+                return out->rc;
+        }
+
+        state_lock(nal, &flags);
+
+        if (nal->cb_map != NULL) {
+                /* describe the whole event array as a single fragment */
+                struct iovec ring;
+
+                ring.iov_base = in->base_in;
+                ring.iov_len = in->count_in * sizeof (ptl_event_t);
+
+                out->rc = nal->cb_map (nal, 1, &ring, &new_eq->eq_addrkey);
+                if (out->rc != PTL_OK) {
+                        /* mapping failed: give the descriptor back */
+                        lib_eq_free (nal, new_eq);
+                        state_unlock (nal, &flags);
+                        return out->rc;
+                }
+        }
+
+        new_eq->sequence = 1;
+        new_eq->base = in->base_in;
+        new_eq->size = in->count_in;
+        new_eq->eq_refcount = 0;
+        new_eq->event_callback = in->callback_in;
+
+        /* make the handle resolvable and track the queue as active */
+        lib_initialise_handle (nal, &new_eq->eq_lh);
+        list_add (&new_eq->eq_list, &nal->ni.ni_active_eqs);
+
+        state_unlock(nal, &flags);
+
+        ptl_eq2handle(&out->handle_out, new_eq);
+        out->rc = PTL_OK;
+        return out->rc;
+}
+
+/*
+ * Library-side handler for freeing an event queue.
+ *
+ * Reads from the PtlEQFree_in block:
+ *      eventq_in    handle of the queue to free
+ *
+ * Writes to the PtlEQFree_out block:
+ *      rc           PTL_OK, PTL_INV_EQ when the handle does not resolve,
+ *                   or PTL_EQ_INUSE when the queue's refcount is non-zero
+ *
+ * Returns the same value it stores in rc.  The lookup and the teardown
+ * both run under the state lock.
+ */
+int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args,
+                          void *v_ret)
+{
+        PtlEQFree_in *args = v_args;
+        PtlEQFree_out *ret = v_ret;
+        lib_eq_t *eq;
+        /* unsigned long, matching every other state_lock() caller here */
+        unsigned long flags;
+
+        state_lock (nal, &flags);
+
+        eq = ptl_handle2eq(&args->eventq_in, nal);
+        if (eq == NULL) {
+                ret->rc = PTL_INV_EQ;
+        } else if (eq->eq_refcount != 0) {
+                ret->rc = PTL_EQ_INUSE;
+        } else {
+                if (nal->cb_unmap != NULL) {
+                        /* same single-fragment description that was mapped
+                         * when the queue was allocated */
+                        struct iovec iov = {
+                                .iov_base = eq->base,
+                                .iov_len = eq->size * sizeof (ptl_event_t) };
+
+                        nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey);
+                }
+
+                lib_invalidate_handle (nal, &eq->eq_lh);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock (nal, &flags);
+
+        return (ret->rc);
+}
diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c
new file mode 100644 (file)
index 0000000..40f3d2c
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-init.c
+ * Start up the internal library and clear all structures
+ * Called by the NAL when it initializes.  Safe to call multiple times.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+
+#ifdef __KERNEL__
+# include <linux/string.h>      /* for memset() */
+# include <linux/kp30.h>
+# ifdef KERNEL_ADDR_CACHE
+#  include <compute/OS/addrCache/cache.h>
+# endif
+#else
+# include <string.h>
+# include <sys/time.h>
+#endif
+
+#ifdef PTL_USE_SLAB_CACHE
+/* count of kportal_descriptor_setup() calls not yet balanced by
+ * kportal_descriptor_cleanup(); the slabs exist while it is non-zero */
+static int ptl_slab_users;
+
+/* slab caches created by kportal_descriptor_setup(), one per
+ * descriptor type (MD, message, ME, EQ) */
+kmem_cache_t *ptl_md_slab;
+kmem_cache_t *ptl_msg_slab;
+kmem_cache_t *ptl_me_slab;
+kmem_cache_t *ptl_eq_slab;
+
+/* per-type counters asserted to be zero before the slabs are destroyed;
+ * presumably maintained by the descriptor alloc/free helpers, which are
+ * not visible in this file -- TODO confirm */
+atomic_t md_in_use_count;
+atomic_t msg_in_use_count;
+atomic_t me_in_use_count;
+atomic_t eq_in_use_count;
+
+/* NB zeroing in ctor and on freeing ensures items that
+ * kmem_cache_validate() OK, but haven't been initialised
+ * as an MD/ME/EQ can't have valid handles
+ */
+/* Slab constructor for MDs: hands out every new object fully zeroed. */
+static void
+ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        lib_md_t *md = obj;
+
+        memset (md, 0, sizeof (*md));
+}
+
+/* Slab constructor for MEs: hands out every new object fully zeroed. */
+static void
+ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        lib_me_t *me = obj;
+
+        memset (me, 0, sizeof (*me));
+}
+
+/* Slab constructor for EQs: hands out every new object fully zeroed. */
+static void
+ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        lib_eq_t *eq = obj;
+
+        memset (eq, 0, sizeof (*eq));
+}
+
+/*
+ * Create the MD, MSG, ME and EQ slab caches shared by all NALs.
+ *
+ * Only the first caller actually creates the caches; later callers just
+ * bump the user count and return 0.  Returns PTL_OK on success or
+ * PTL_NOSPACE as soon as one cache cannot be created.
+ */
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+
+        /* We'll have 1 set of slabs for ALL the nals :) */
+
+        if (ptl_slab_users++)
+                return 0;
+
+        ptl_md_slab = kmem_cache_create("portals_MD",
+                                        sizeof(lib_md_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_md_slab_ctor, NULL);
+        if (!ptl_md_slab) {
+                /* messages end in '\n' like the other CERROR calls here */
+                CERROR("couldn't allocate ptl_md_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        /* NB no ctor for msgs; they don't need handle verification */
+        ptl_msg_slab = kmem_cache_create("portals_MSG",
+                                         sizeof(lib_msg_t), 0,
+                                         SLAB_HWCACHE_ALIGN,
+                                         NULL, NULL);
+        if (!ptl_msg_slab) {
+                CERROR("couldn't allocate ptl_msg_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_me_slab = kmem_cache_create("portals_ME",
+                                        sizeof(lib_me_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_me_slab_ctor, NULL);
+        if (!ptl_me_slab) {
+                CERROR("couldn't allocate ptl_me_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_eq_slab = kmem_cache_create("portals_EQ",
+                                        sizeof(lib_eq_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_eq_slab_ctor, NULL);
+        if (!ptl_eq_slab) {
+                CERROR("couldn't allocate ptl_eq_t slab\n");
+                RETURN (PTL_NOSPACE);
+        }
+
+        RETURN(PTL_OK);
+}
+
+/*
+ * Drop one user of the shared slab caches and destroy them when the
+ * last user goes away.  All in-use counters must be zero by then.
+ *
+ * Each pointer is reset to NULL after its cache is destroyed.  Without
+ * that, a later kportal_descriptor_setup() that fails part-way leaves
+ * the not-yet-recreated pointers stale, and the cleanup that must
+ * follow would destroy an already destroyed cache.
+ */
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        if (--ptl_slab_users != 0)
+                return;
+
+        LASSERT (atomic_read (&md_in_use_count) == 0);
+        LASSERT (atomic_read (&me_in_use_count) == 0);
+        LASSERT (atomic_read (&eq_in_use_count) == 0);
+        LASSERT (atomic_read (&msg_in_use_count) == 0);
+
+        if (ptl_md_slab != NULL) {
+                kmem_cache_destroy(ptl_md_slab);
+                ptl_md_slab = NULL;
+        }
+        if (ptl_msg_slab != NULL) {
+                kmem_cache_destroy(ptl_msg_slab);
+                ptl_msg_slab = NULL;
+        }
+        if (ptl_me_slab != NULL) {
+                kmem_cache_destroy(ptl_me_slab);
+                ptl_me_slab = NULL;
+        }
+        if (ptl_eq_slab != NULL) {
+                kmem_cache_destroy(ptl_eq_slab);
+                ptl_eq_slab = NULL;
+        }
+}
+#else
+
+/*
+ * Carve one cb_malloc()ed region into 'n' zeroed objects and thread
+ * them all onto fl->fl_list.
+ *
+ * 'size' is the payload size; the stored object size also includes the
+ * lib_freeobj_t header that precedes fo_contents.  Returns PTL_OK, or
+ * PTL_NOSPACE when the region cannot be allocated.
+ */
+int
+lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
+{
+        char *obj;
+        int   i;
+
+        LASSERT (n > 0);
+
+        size += offsetof (lib_freeobj_t, fo_contents);
+
+        obj = nal->cb_malloc (nal, n * size);
+        if (obj == NULL)
+                return (PTL_NOSPACE);
+
+        INIT_LIST_HEAD (&fl->fl_list);
+        fl->fl_objs = obj;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+
+        /* n > 0 is asserted above, so at least one object is linked */
+        for (i = 0; i < n; i++, obj += size) {
+                memset (obj, 0, size);
+                /* the start of each object is used as its list linkage */
+                list_add ((struct list_head *)obj, &fl->fl_list);
+        }
+
+        return (PTL_OK);
+}
+
+/*
+ * Release the storage behind a freelist and reset the freelist header.
+ *
+ * A freelist that was never initialised (fl_nobjs == 0) is left alone.
+ * Every object must be back on the list; that is asserted before the
+ * region is handed to cb_free().
+ */
+void
+lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
+{
+        struct list_head *el;
+        int               count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        /* clear the whole structure, not just pointer-size bytes, so
+         * fl_nobjs reads 0 and a repeated fini is a no-op */
+        memset (fl, 0, sizeof (*fl));
+}
+
+/*
+ * Freelist flavour of descriptor setup: zero the four per-interface
+ * freelists, then populate them in the order ME, MSG, MD, EQ.
+ *
+ * Stops at the first freelist that cannot be built and returns its
+ * error code; returns PTL_OK when all four succeed.
+ */
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+        lib_ni_t *ni = &nal->ni;
+        int       rc;
+
+        /* zeroed first so cleanup can tell which lists were populated */
+        memset (&ni->ni_free_mes,  0, sizeof (ni->ni_free_mes));
+        memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs));
+        memset (&ni->ni_free_mds,  0, sizeof (ni->ni_free_mds));
+        memset (&ni->ni_free_eqs,  0, sizeof (ni->ni_free_eqs));
+
+        rc = lib_freelist_init (nal, &ni->ni_free_mes,
+                                MAX_MES, sizeof (lib_me_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_msgs,
+                                        MAX_MSGS, sizeof (lib_msg_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_mds,
+                                        MAX_MDS, sizeof (lib_md_t));
+
+        if (rc == PTL_OK)
+                rc = lib_freelist_init (nal, &ni->ni_free_eqs,
+                                        MAX_EQS, sizeof (lib_eq_t));
+
+        return (rc);
+}
+
+/*
+ * Freelist flavour of descriptor cleanup: tear down the ME, MSG, MD
+ * and EQ freelists of this interface, in that order.
+ */
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        lib_freelist_fini (nal, &ni->ni_free_mes);
+        lib_freelist_fini (nal, &ni->ni_free_msgs);
+        lib_freelist_fini (nal, &ni->ni_free_mds);
+        lib_freelist_fini (nal, &ni->ni_free_eqs);
+}
+
+#endif
+
+/*
+ * Build the interface cookie from the current time of day, expressed
+ * as microseconds (tv_sec * 1000000 + tv_usec).
+ */
+__u64
+lib_create_interface_cookie (nal_cb_t *nal)
+{
+        /* NB the interface cookie in wire handles guards against delayed
+         * replies and ACKs appearing valid in a new instance of the same
+         * interface.  Initialisation time, even if it's only implemented
+         * to millisecond resolution is probably easily good enough. */
+        struct timeval now;
+#ifndef __KERNEL__
+        int            rc;
+
+        rc = gettimeofday (&now, NULL);
+        LASSERT (rc == 0);
+#else
+        do_gettimeofday(&now);
+#endif
+        return (((__u64)now.tv_sec) * 1000000 + now.tv_usec);
+}
+
+/*
+ * Allocate and initialise the per-interface handle hash table and
+ * reset the object cookie counter.
+ *
+ * Returns PTL_OK, or PTL_NOSPACE when the table cannot be allocated
+ * (ni_lh_hash_table is then left NULL).
+ */
+int
+lib_setup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *bucket;
+        int               idx;
+
+        /* Arbitrary choice of hash table size */
+#ifdef __KERNEL__
+        ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head);
+#else
+        ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#endif
+        ni->ni_lh_hash_table =
+                (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size
+                                                    * sizeof (struct list_head));
+        if (ni->ni_lh_hash_table == NULL)
+                return (PTL_NOSPACE);
+
+        /* every bucket starts out as an empty chain */
+        bucket = ni->ni_lh_hash_table;
+        for (idx = 0; idx < ni->ni_lh_hash_size; idx++, bucket++)
+                INIT_LIST_HEAD (bucket);
+
+        ni->ni_next_object_cookie = 0;
+
+        return (PTL_OK);
+}
+
+/*
+ * Free the handle hash table, if there is one.
+ *
+ * The pointer is cleared afterwards so that a second call (for example
+ * from an error path that runs after a previous teardown) does not
+ * hand the same memory to cb_free() again.
+ */
+void
+lib_cleanup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->ni_lh_hash_table == NULL)
+                return;
+
+        nal->cb_free (nal, ni->ni_lh_hash_table,
+                      ni->ni_lh_hash_size * sizeof (struct list_head));
+        ni->ni_lh_hash_table = NULL;
+}
+
+/*
+ * Find the handle whose lh_cookie equals 'cookie'.
+ *
+ * The bucket is selected by the low bits of the cookie modulo the
+ * table size; the chain is then searched linearly.  Returns the
+ * matching handle or NULL.
+ */
+lib_handle_t *
+lib_lookup_cookie (nal_cb_t *nal, __u64 cookie)
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t         *ni = &nal->ni;
+        unsigned int      slot = ((unsigned int)cookie) % ni->ni_lh_hash_size;
+        struct list_head *chain = &ni->ni_lh_hash_table[slot];
+        struct list_head *pos;
+
+        list_for_each (pos, chain) {
+                lib_handle_t *candidate =
+                        list_entry (pos, lib_handle_t, lh_hash_chain);
+
+                if (candidate->lh_cookie == cookie)
+                        return (candidate);
+        }
+
+        return (NULL);
+}
+
+/*
+ * Give 'lh' the next object cookie of this interface and insert it
+ * into the hash bucket that cookie maps to.
+ */
+void
+lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *bucket;
+
+        lh->lh_cookie = ni->ni_next_object_cookie++;
+
+        /* same bucket selection as lib_lookup_cookie() */
+        bucket = &ni->ni_lh_hash_table[((unsigned int)lh->lh_cookie) %
+                                       ni->ni_lh_hash_size];
+        list_add (&lh->lh_hash_chain, bucket);
+}
+
+/*
+ * Take 'lh' off its hash chain so that cookie lookups no longer find it.
+ */
+void
+lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        struct list_head *link = &lh->lh_hash_chain;
+
+        list_del (link);
+}
+
+/*
+ * Initialise the library state embedded in 'nal', or take another
+ * reference if it is already initialised.
+ *
+ *   nid, pid   identity recorded for this interface
+ *   gsize      stored as the number of nodes
+ *   ptl_size   number of entries in the portal table
+ *   acl_size   not used by the code in this function
+ *
+ * Returns PTL_OK on success.  On failure the handle hash and the
+ * descriptor pools are cleaned up again and the error (PTL_NOSPACE or
+ * whatever the setup helpers returned) is passed back.
+ */
+int
+lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+         ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size)
+{
+        int       rc = PTL_OK;
+        lib_ni_t *ni = &nal->ni;
+        int i;
+        ENTRY;
+
+        /* NB serialised in PtlNIInit() */
+
+        if (ni->refcnt != 0) {                       /* already initialised */
+                ni->refcnt++;
+                goto out;
+        }
+
+        /*
+         * Allocate the portal table for this interface
+         * and all per-interface objects.
+         */
+        memset(&ni->counters, 0, sizeof(lib_counters_t));
+
+        /* The error path below calls lib_cleanup_handle_hash() even when
+         * descriptor setup fails before the hash table is allocated, so
+         * make sure it never sees a pointer this call did not set. */
+        ni->ni_lh_hash_table = NULL;
+
+        rc = kportal_descriptor_setup (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        INIT_LIST_HEAD (&ni->ni_active_msgs);
+        INIT_LIST_HEAD (&ni->ni_active_mds);
+        INIT_LIST_HEAD (&ni->ni_active_eqs);
+
+        INIT_LIST_HEAD (&ni->ni_test_peers);
+
+        ni->ni_interface_cookie = lib_create_interface_cookie (nal);
+        ni->ni_next_object_cookie = 0;
+        rc = lib_setup_handle_hash (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        ni->nid = nid;
+        ni->pid = pid;
+
+        ni->num_nodes = gsize;
+        ni->tbl.size = ptl_size;
+
+        ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size);
+        if (ni->tbl.tbl == NULL) {
+                rc = PTL_NOSPACE;
+                goto out;
+        }
+
+        /* each portal starts with an empty match list */
+        for (i = 0; i < ptl_size; i++)
+                INIT_LIST_HEAD(&(ni->tbl.tbl[i]));
+
+        ni->debug = PTL_DEBUG_NONE;
+        ni->up = 1;
+        ni->refcnt++;
+
+ out:
+        if (rc != PTL_OK) {
+                /* descriptor cleanup is required even after a failed setup */
+                lib_cleanup_handle_hash (nal);
+                kportal_descriptor_cleanup (nal);
+        }
+
+        RETURN (rc);
+}
+
+/*
+ * Drop one reference on the library state in 'nal'.  When the last
+ * reference goes, every ME, MD, EQ and message still linked is
+ * reported and freed, then the portal table, the handle hash and the
+ * descriptor pools are released.  Always returns PTL_OK.
+ */
+int
+lib_fini(nal_cb_t * nal)
+{
+        lib_ni_t *ni = &nal->ni;
+        int       portal;
+
+        if (--ni->refcnt != 0)
+                return (PTL_OK);
+
+        /* NB no stat_lock() since this is the last reference.  The NAL
+         * should have shut down already, so it should be safe to unlink
+         * and free all descriptors, even those that appear committed to a
+         * network op (eg MD with non-zero pending count)
+         */
+
+        /* match entries still hanging off any portal */
+        for (portal = 0; portal < ni->tbl.size; portal++) {
+                struct list_head *entries = &ni->tbl.tbl[portal];
+
+                while (!list_empty (entries)) {
+                        lib_me_t *me = list_entry (entries->next,
+                                                   lib_me_t, me_list);
+
+                        CERROR ("Active me %p on exit\n", me);
+                        list_del (&me->me_list);
+                        lib_me_free (nal, me);
+                }
+        }
+
+        /* memory descriptors */
+        while (!list_empty (&ni->ni_active_mds)) {
+                lib_md_t *md = list_entry (ni->ni_active_mds.next,
+                                           lib_md_t, md_list);
+
+                CERROR ("Active md %p on exit\n", md);
+                list_del (&md->md_list);
+                lib_md_free (nal, md);
+        }
+
+        /* event queues */
+        while (!list_empty (&ni->ni_active_eqs)) {
+                lib_eq_t *eq = list_entry (ni->ni_active_eqs.next,
+                                           lib_eq_t, eq_list);
+
+                CERROR ("Active eq %p on exit\n", eq);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+        }
+
+        /* messages */
+        while (!list_empty (&ni->ni_active_msgs)) {
+                lib_msg_t *msg = list_entry (ni->ni_active_msgs.next,
+                                             lib_msg_t, msg_list);
+
+                CERROR ("Active msg %p on exit\n", msg);
+                list_del (&msg->msg_list);
+                lib_msg_free (nal, msg);
+        }
+
+        nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size);
+        ni->up = 0;
+
+        lib_cleanup_handle_hash (nal);
+        kportal_descriptor_cleanup (nal);
+
+        return (PTL_OK);
+}
diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c
new file mode 100644 (file)
index 0000000..d171050
--- /dev/null
@@ -0,0 +1,412 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-md.c
+ * Memory Descriptor management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Unlink and free a memory descriptor; must be called with state lock
+ * held.
+ *
+ * If the MD still has operations pending it is only flagged with
+ * PTL_MD_FLAG_UNLINK and left in place.  Otherwise its fragments are
+ * unmapped, it is detached from its ME and EQ, its handle is
+ * invalidated and the descriptor is freed.
+ */
+void lib_md_unlink(nal_cb_t * nal, lib_md_t * md)
+{
+        lib_me_t *owner = md->me;
+
+        if (md->pending != 0) {
+                CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+                md->md_flags |= PTL_MD_FLAG_UNLINK;
+                return;
+        }
+
+        CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+        /* undo the mapping with the callback that matches the MD type */
+        if ((md->options & PTL_MD_KIOV) == 0) {
+                if (nal->cb_unmap != NULL)
+                        nal->cb_unmap (nal, md->md_niov, md->md_iov.iov,
+                                       &md->md_addrkey);
+        } else if (nal->cb_unmap_pages != NULL) {
+                nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov,
+                                     &md->md_addrkey);
+        }
+
+        if (owner != NULL) {
+                owner->md = NULL;
+                /* an ME created with PTL_UNLINK goes away with its MD */
+                if (owner->unlink == PTL_UNLINK)
+                        lib_me_unlink(nal, owner);
+        }
+
+        if (md->eq != NULL) {
+                md->eq->eq_refcount--;
+                LASSERT (md->eq->eq_refcount >= 0);
+        }
+
+        lib_invalidate_handle (nal, &md->md_lh);
+        list_del (&md->md_list);
+        lib_md_free(nal, md);
+}
+
+/* must be called with state lock held */
+static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
+                        ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink)
+{
+        const int     max_size_opts = PTL_MD_AUTO_UNLINK |
+                                      PTL_MD_MAX_SIZE;
+        lib_eq_t     *eq = NULL;
+        int           rc;
+        int           i;
+
+        /* NB we are passes an allocated, but uninitialised/active md.
+         * if we return success, caller may lib_md_unlink() it.
+         * otherwise caller may only lib_md_free() it.
+         */
+
+        if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) {
+                eq = ptl_handle2eq(eqh, nal);
+                if (eq == NULL)
+                        return PTL_INV_EQ;
+        }
+
+        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
+            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
+                return PTL_IOV_TOO_MANY;
+
+        if ((md->options & max_size_opts) != 0 && /* max size used */
+            (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
+                return PTL_INV_MD;
+
+        new->me = NULL;
+        new->start = md->start;
+        new->length = md->length;
+        new->offset = 0;
+        new->max_size = md->max_size;
+        new->unlink = unlink;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        new->eq = eq;
+        new->threshold = md->threshold;
+        new->pending = 0;
+        new->md_flags = 0;
+
+        if ((md->options & PTL_MD_IOV) != 0) {
+                int total_length = 0;
+
+                if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */
+                        return PTL_INV_MD; 
+
+                new->md_niov = md->niov;
+                
+                if (nal->cb_read (nal, private, new->md_iov.iov, md->start,
+                                  md->niov * sizeof (new->md_iov.iov[0])))
+                        return PTL_SEGV;
+
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the base address on trust */
+                        if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                                return PTL_VAL_FAILED;
+
+                        total_length += new->md_iov.iov[i].iov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+                
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+#ifndef __KERNEL__
+                return PTL_INV_MD;
+#else
+                int total_length = 0;
+                
+                /* Trap attempt to use paged I/O if unsupported early. */
+                if (nal->cb_send_pages == NULL ||
+                    nal->cb_recv_pages == NULL)
+                        return PTL_INV_MD;
+
+                new->md_niov = md->niov;
+
+                if (nal->cb_read (nal, private, new->md_iov.kiov, md->start,
+                                  md->niov * sizeof (new->md_iov.kiov[0])))
+                        return PTL_SEGV;
+                
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the page pointer on trust */
+                        if (new->md_iov.kiov[i].kiov_offset + 
+                            new->md_iov.kiov[i].kiov_len > PAGE_SIZE )
+                                return PTL_VAL_FAILED; /* invalid length */
+
+                        total_length += new->md_iov.kiov[i].kiov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+
+                if (nal->cb_map_pages != NULL) {
+                        rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, 
+                                                &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+#endif
+        } else {   /* contiguous */
+                new->md_niov = 1;
+                new->md_iov.iov[0].iov_base = md->start;
+                new->md_iov.iov[0].iov_len = md->length;
+
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } 
+
+        if (eq != NULL)
+                eq->eq_refcount++;
+
+        /* It's good; let handle2md succeed and add to active mds */
+        lib_initialise_handle (nal, &new->md_lh);
+        list_add (&new->md_list, &nal->ni.ni_active_mds);
+
+        return PTL_OK;
+}
+
+/*
+ * Copy the user-visible fields of library MD 'md' back into the API
+ * descriptor 'new'.  Must be called with state lock held.
+ */
+void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new)
+{
+        /* NB this doesn't copy out all the iov entries so when a
+         * discontiguous MD is copied out, the target gets to know the
+         * original iov pointer (in start) and the number of entries it had
+         * and that's all.
+         */
+        new->start = md->start;
+        new->length = md->length;
+        new->threshold = md->threshold;
+        new->max_size = md->max_size;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        ptl_eq2handle(&new->eventq, md->eq);
+
+        /* a fragment count is only reported for IOV/KIOV descriptors */
+        if ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) != 0)
+                new->niov = md->md_niov;
+        else
+                new->niov = 0;
+}
+
+/*
+ * Library-side handler that builds an MD and attaches it to an ME.
+ *
+ * Reads from the PtlMDAttach_in block:
+ *      me_in       handle of the match entry to attach to
+ *      md_in       user description of the memory
+ *      eq_in       event queue handle for the new MD
+ *      unlink_in   unlink policy for the new MD
+ *
+ * Writes to the PtlMDAttach_out block:
+ *      rc          PTL_NOSPACE, PTL_INV_ME, PTL_INUSE or the result of
+ *                  lib_md_build()
+ *      handle_out  handle of the new MD (on success only)
+ */
+int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDAttach_in  *in = v_args;
+        PtlMDAttach_out *out = v_ret;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        unsigned long    flags;
+
+        /* the descriptor is obtained before the state lock is taken */
+        md = lib_md_alloc (nal);
+        if (md == NULL) {
+                out->rc = PTL_NOSPACE;
+                return (out->rc);
+        }
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&in->me_in, nal);
+        if (me == NULL) {
+                out->rc = PTL_INV_ME;
+                goto fail;
+        }
+
+        /* an ME can carry only one MD at a time */
+        if (me->md != NULL) {
+                out->rc = PTL_INUSE;
+                goto fail;
+        }
+
+        out->rc = lib_md_build(nal, md, private, &in->md_in,
+                               &in->eq_in, in->unlink_in);
+        if (out->rc != PTL_OK)
+                goto fail;
+
+        /* link the pair in both directions */
+        me->md = md;
+        md->me = me;
+
+        ptl_md2handle(&out->handle_out, md);
+
+        state_unlock (nal, &flags);
+        return (PTL_OK);
+
+ fail:
+        /* the MD never became active, so it is simply freed */
+        lib_md_free (nal, md);
+
+        state_unlock (nal, &flags);
+        return (out->rc);
+}
+
+/*
+ * Library-side handler that builds a free-standing MD (no ME).
+ *
+ * Reads from the PtlMDBind_in block:
+ *      md_in       user description of the memory
+ *      eq_in       event queue handle for the new MD
+ *
+ * Writes to the PtlMDBind_out block:
+ *      rc          PTL_NOSPACE or the result of lib_md_build()
+ *      handle_out  handle of the new MD (on success only)
+ *
+ * The MD is always built with the PTL_UNLINK policy.
+ */
+int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDBind_in  *in = v_args;
+        PtlMDBind_out *out = v_ret;
+        lib_md_t      *md;
+        unsigned long  flags;
+
+        /* the descriptor is obtained before the state lock is taken */
+        md = lib_md_alloc (nal);
+        if (md == NULL) {
+                out->rc = PTL_NOSPACE;
+                return (out->rc);
+        }
+
+        state_lock(nal, &flags);
+
+        out->rc = lib_md_build(nal, md, private,
+                               &in->md_in, &in->eq_in, PTL_UNLINK);
+
+        if (out->rc != PTL_OK) {
+                /* the MD never became active, so it is simply freed */
+                lib_md_free (nal, md);
+
+                state_unlock(nal, &flags);
+                return (out->rc);
+        }
+
+        ptl_md2handle(&out->handle_out, md);
+
+        state_unlock(nal, &flags);
+        return (PTL_OK);
+}
+
+int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Unlink the MD named by args->md_in.
+         *
+         * ret->rc is PTL_INV_MD when the handle does not resolve,
+         * PTL_MD_INUSE while the MD still has pending operations, and
+         * PTL_OK otherwise; on PTL_OK ret->status_out holds the MD as
+         * reported by lib_md_deconstruct() just before it was unlinked.
+         */
+        PtlMDUnlink_in *in = v_args;
+        PtlMDUnlink_out *out = v_ret;
+        lib_md_t *md;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&in->md_in, nal);
+        if (md == NULL) {
+                out->rc = PTL_INV_MD;
+                goto done;
+        }
+
+        if (md->pending != 0) {                 /* being filled/spilled */
+                out->rc = PTL_MD_INUSE;
+                goto done;
+        }
+
+        /* A busy MD that is due to be unlinked when its network op
+         * completes must look INUSE to callers until then and INV_MD
+         * afterwards, so an idle MD can never carry the UNLINK flag. */
+        LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
+
+        lib_md_deconstruct(nal, md, &out->status_out);
+        lib_md_unlink(nal, md);
+        out->rc = PTL_OK;
+
+ done:
+        state_unlock(nal, &flags);
+        return (out->rc);
+}
+
+int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
+                            void *v_ret)
+{
+        /*
+         * Optionally report the current state of an MD and optionally
+         * rebuild it from a new description, all under the state lock.
+         *
+         * Incoming (PtlMDUpdate_internal_in):
+         *      ptl_handle_md_t md_in
+         *      old_inout_valid         non-zero => copy the MD out
+         *      new_inout_valid         non-zero => attempt the update
+         *      ptl_md_t                new_inout
+         *      ptl_handle_eq_t testq_in
+         *      ptl_seq_t               sequence_in
+         *
+         * Outgoing (PtlMDUpdate_internal_out):
+         *      rc                      PTL_OK, PTL_INV_MD, PTL_INV_EQ,
+         *                              PTL_NOUPDATE or the lib_md_build()
+         *                              result
+         *      ptl_md_t                old_inout
+         */
+        PtlMDUpdate_internal_in *args = v_args;
+        PtlMDUpdate_internal_out *ret = v_ret;
+        lib_md_t *md;
+        lib_eq_t *test_eq = NULL;
+        ptl_md_t *new = &args->new_inout;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                 ret->rc = PTL_INV_MD;
+                 goto out;
+        }
+
+        /* snapshot the MD as it is now, before any update is applied */
+        if (args->old_inout_valid)
+                lib_md_deconstruct(nal, md, &ret->old_inout);
+
+        /* a pure query: nothing to update */
+        if (!args->new_inout_valid) {
+                ret->rc = PTL_OK;
+                goto out;
+        }
+
+        /* a test EQ is optional; PTL_EQ_NONE makes the update unconditional
+         * (test_eq stays NULL) */
+        if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
+                test_eq = ptl_handle2eq(&args->testq_in, nal);
+                if (test_eq == NULL) {
+                        ret->rc = PTL_INV_EQ;
+                        goto out;
+                }
+        }
+
+        /* never rebuild an MD that has operations in flight */
+        if (md->pending != 0) {
+                        ret->rc = PTL_NOUPDATE;
+                        goto out;
+        }
+
+        /* update only if the test EQ's sequence still equals the caller's
+         * sequence_in */
+        if (test_eq == NULL ||
+            test_eq->sequence == args->sequence_in) {
+                /* md->me is saved and restored around the rebuild;
+                 * NOTE(review): presumably lib_md_build() resets it -
+                 * confirm against lib_md_build() */
+                lib_me_t *me = md->me;
+
+#warning this does not track eq refcounts properly
+
+                ret->rc = lib_md_build(nal, md, private,
+                                       new, &new->eventq, md->unlink);
+
+                md->me = me;
+        } else {
+                ret->rc = PTL_NOUPDATE;
+        }
+
+ out:
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c
new file mode 100644 (file)
index 0000000..34fb606
--- /dev/null
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-me.c
+ * Match Entry management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me);
+
+int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Create a match entry and link it into the match list of portal
+         * args->index_in: at the tail for PTL_INS_AFTER, at the front for
+         * any other position_in.
+         *
+         * ret->rc is PTL_INV_PTINDEX for an out-of-range portal index,
+         * PTL_NOSPACE when no ME can be allocated, PTL_OK otherwise; on
+         * PTL_OK ret->handle_out names the new ME.
+         */
+        PtlMEAttach_in *in = v_args;
+        PtlMEAttach_out *out = v_ret;
+        lib_ptl_t *portals = &nal->ni.tbl;
+        struct list_head *chain;
+        lib_me_t *entry;
+        unsigned long flags;
+
+        if (in->index_in < 0 || in->index_in >= portals->size) {
+                out->rc = PTL_INV_PTINDEX;
+                return out->rc;
+        }
+
+        /* NB the match id is not validated yet, so PTL_INV_PROC is never
+         * returned from here */
+
+        entry = lib_me_alloc(nal);
+        if (entry == NULL) {
+                out->rc = PTL_NOSPACE;
+                return (out->rc);
+        }
+
+        state_lock(nal, &flags);
+
+        /* a new ME starts life without an MD */
+        entry->md = NULL;
+        entry->unlink = in->unlink_in;
+        entry->ignore_bits = in->ignore_bits_in;
+        entry->match_bits = in->match_bits_in;
+        entry->match_id = in->match_id_in;
+
+        lib_initialise_handle(nal, &entry->me_lh);
+
+        /* 'chain' is the list head itself: list_add_tail() appends,
+         * list_add() prepends */
+        chain = &(portals->tbl[in->index_in]);
+        if (in->position_in != PTL_INS_AFTER)
+                list_add(&entry->me_list, chain);
+        else
+                list_add_tail(&entry->me_list, chain);
+
+        ptl_me2handle(&out->handle_out, entry);
+
+        state_unlock(nal, &flags);
+
+        out->rc = PTL_OK;
+        return out->rc;
+}
+
+int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Create a new match entry and link it next to an existing one.
+         *
+         * Read from v_args (PtlMEInsert_in): current_in (handle of the
+         * reference ME), match_id_in, match_bits_in, ignore_bits_in,
+         * unlink_in and position_in.
+         *
+         * Written to v_ret (PtlMEInsert_out): rc (PTL_OK, PTL_NOSPACE or
+         * PTL_INV_ME) and, on PTL_OK, handle_out for the new ME.
+         */
+        PtlMEInsert_in *args = v_args;
+        PtlMEInsert_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+        lib_me_t *new;
+
+        new = lib_me_alloc (nal);
+        if (new == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        /* Should check for valid matchid, but not yet */
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                /* stale/bogus reference handle: drop the unused entry */
+                lib_me_free (nal, new);
+
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_INV_ME);
+        }
+
+        new->match_id = args->match_id_in;
+        new->match_bits = args->match_bits_in;
+        new->ignore_bits = args->ignore_bits_in;
+        new->unlink = args->unlink_in;
+        new->md = NULL;
+
+        lib_initialise_handle (nal, &new->me_lh);
+
+        /* 'me' is an element of the match list here, not the list head:
+         * list_add() links 'new' immediately after it, list_add_tail()
+         * immediately before it.  (The two calls used to be swapped, so
+         * PTL_INS_AFTER inserted in front of the reference entry.) */
+        if (args->position_in == PTL_INS_AFTER)
+                list_add(&new->me_list, &me->me_list);
+        else
+                list_add_tail(&new->me_list, &me->me_list);
+
+        ptl_me2handle(&ret->handle_out, new);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Unlink the match entry named by args->current_in.
+         * ret->rc is PTL_INV_ME when the handle does not resolve,
+         * PTL_OK once lib_me_unlink() has run.
+         */
+        PtlMEUnlink_in *in = v_args;
+        PtlMEUnlink_out *out = v_ret;
+        lib_me_t *doomed;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        doomed = ptl_handle2me(&in->current_in, nal);
+        if (doomed != NULL) {
+                lib_me_unlink(nal, doomed);
+                out->rc = PTL_OK;
+        } else {
+                out->rc = PTL_INV_ME;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (out->rc);
+}
+
+/*
+ * Remove 'me' from its match list, unlink any MD attached to it,
+ * invalidate its handle and free it.
+ *
+ * The caller must hold the state_lock.
+ */
+void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+{
+        lib_md_t *attached;
+
+        if ((nal->ni.debug & PTL_DEBUG_UNLINK) != 0) {
+                /* the handle is computed but not reported anywhere */
+                ptl_handle_any_t handle;
+
+                ptl_me2handle(&handle, me);
+        }
+
+        list_del(&me->me_list);
+
+        attached = me->md;
+        if (attached != NULL) {
+                /* break the back pointer before the MD is unlinked */
+                attached->me = NULL;
+                lib_md_unlink(nal, attached);
+        }
+
+        lib_invalidate_handle(nal, &me->me_lh);
+        lib_me_free(nal, me);
+}
+
+int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Print every match entry on portal args->index_in through the
+         * NAL's cb_printf.  ret->rc is PTL_INV_PTINDEX for an
+         * out-of-range index and PTL_OK otherwise.
+         */
+        PtlTblDump_in *in = v_args;
+        PtlTblDump_out *out = v_ret;
+        lib_ptl_t *portals = &nal->ni.tbl;
+        struct list_head *pos;
+        ptl_handle_any_t handle;
+        unsigned long flags;
+
+        if (in->index_in < 0 || in->index_in >= portals->size) {
+                out->rc = PTL_INV_PTINDEX;
+                return out->rc;
+        }
+
+        /* the banner is printed before the state lock is taken */
+        nal->cb_printf(nal, "Portal table index %d\n", in->index_in);
+
+        state_lock(nal, &flags);
+
+        list_for_each(pos, &(portals->tbl[in->index_in])) {
+                lib_me_t *entry = list_entry(pos, lib_me_t, me_list);
+
+                /* the handle is computed but not printed */
+                ptl_me2handle(&handle, entry);
+                lib_me_dump(nal, entry);
+        }
+
+        state_unlock(nal, &flags);
+
+        out->rc = PTL_OK;
+        return out->rc;
+}
+
+int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Print the match entry named by args->current_in.
+         * ret->rc is PTL_INV_ME when the handle does not resolve,
+         * PTL_OK otherwise.
+         */
+        PtlMEDump_in *in = v_args;
+        PtlMEDump_out *out = v_ret;
+        lib_me_t *entry;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        entry = ptl_handle2me(&in->current_in, nal);
+        if (entry != NULL) {
+                lib_me_dump(nal, entry);
+                out->rc = PTL_OK;
+        } else {
+                out->rc = PTL_INV_ME;
+        }
+
+        state_unlock(nal, &flags);
+
+        return out->rc;
+}
+
+/*
+ * Print one match entry through the NAL's cb_printf: its address and
+ * handle cookie, its match/ignore bits, its MD pointer and its list
+ * neighbours.
+ */
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me)
+{
+        nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me,
+                       me->me_lh.lh_cookie);
+
+        /* Match bits are handled as 64-bit values elsewhere in this code
+         * (printed with LPX64, byte-swapped with NTOH__u64), so "%lx"
+         * mis-reads the varargs wherever long is 32 bits.  Use the same
+         * LPX64 conversion as the cookie above. */
+        nal->cb_printf(nal, "\tMatch/Ignore\t= "LPX64" / "LPX64"\n",
+                       me->match_bits, me->ignore_bits);
+
+        nal->cb_printf(nal, "\tMD\t= %p\n", me->md);
+
+        /* NB a neighbour may be the list head rather than a real ME; the
+         * pointers are only printed, never dereferenced */
+        nal->cb_printf(nal, "\tprev\t= %p\n",
+                       list_entry(me->me_list.prev, lib_me_t, me_list));
+        nal->cb_printf(nal, "\tnext\t= %p\n",
+                       list_entry(me->me_list.next, lib_me_t, me_list));
+}
diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c
new file mode 100644 (file)
index 0000000..7ba1664
--- /dev/null
@@ -0,0 +1,1287 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-move.c
+ * Data movement routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Right now it does not check access control lists.
+ *
+ * We only support one MD per ME, which is how the Portals 3.1 spec is written.
+ * All previous complication is removed.
+ */
+
+/*
+ * Walk the match list of portal 'index' and return the first match entry
+ * whose MD is attached, still active (threshold != 0), allows one of the
+ * operations in 'op_mask', and whose match id / match bits accept the
+ * sender (src_nid, src_pid, match_bits).
+ *
+ * On a match the MD's offset is advanced past the accepted data and
+ *      *offset_out     is the offset within the MD to use
+ *      *mlength_out    is the number of bytes accepted (<= rlength)
+ *      *unlink_out     is non-zero if the MD should be auto-unlinked
+ *
+ * Returns NULL (after logging) when the portal index is out of range,
+ * nothing matches, or the first match cannot hold the message and does
+ * not permit truncation.  parse_put() calls this with the state lock held.
+ */
+static lib_me_t *
+lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
+            ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset,
+            ptl_match_bits_t match_bits, ptl_size_t *mlength_out,
+            ptl_size_t *offset_out, int *unlink_out)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *match_list;
+        struct list_head *tmp;
+        lib_me_t         *me;
+        lib_md_t         *md;
+        ptl_size_t        mlength;
+        ptl_size_t        offset;
+
+        ENTRY;
+
+        CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
+                "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
+
+        if (index < 0 || index >= ni->tbl.size) {
+                CERROR("Invalid portal %d not in [0-%d]\n",
+                       index, ni->tbl.size);
+                goto failed;
+        }
+
+        /* 'index' comes from the message header in parse_put(), so the
+         * list head address is only formed once it has been range-checked */
+        match_list = &ni->tbl.tbl[index];
+
+        list_for_each (tmp, match_list) {
+                me = list_entry(tmp, lib_me_t, me_list);
+                md = me->md;
+
+                 /* ME attached but MD not attached yet */
+                if (md == NULL)
+                        continue;
+
+                LASSERT (me == md->me);
+
+                /* MD deactivated */
+                if (md->threshold == 0)
+                        continue;
+
+                /* mismatched MD op */
+                if ((md->options & op_mask) == 0)
+                        continue;
+
+                /* mismatched ME nid/pid? */
+                if (me->match_id.nid != PTL_NID_ANY &&
+                    me->match_id.nid != src_nid)
+                        continue;
+
+                if (me->match_id.pid != PTL_PID_ANY &&
+                    me->match_id.pid != src_pid)
+                        continue;
+
+                /* mismatched ME matchbits? */
+                if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0)
+                        continue;
+
+                /* Hurrah! This _is_ a match; check it out... */
+
+                /* locally managed MDs append at their own offset; remotely
+                 * managed ones use the offset the sender asked for */
+                if ((md->options & PTL_MD_MANAGE_REMOTE) == 0)
+                        offset = md->offset;
+                else
+                        offset = roffset;
+
+                /* NOTE(review): 'roffset' is not checked against
+                 * md->length here - confirm how an oversized remote
+                 * offset is meant to be handled */
+                mlength = md->length - offset;
+                if ((md->options & PTL_MD_MAX_SIZE) != 0 &&
+                    mlength > md->max_size)
+                        mlength = md->max_size;
+
+                if (rlength <= mlength) {        /* fits in allowed space */
+                        mlength = rlength;
+                } else if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        /* this packet _really_ is too big */
+                        CERROR("Matching packet %d too big: %d left, "
+                               "%d allowed\n", rlength, md->length - offset,
+                               mlength);
+                        goto failed;
+                }
+
+                md->offset = offset + mlength;
+
+                *offset_out = offset;
+                *mlength_out = mlength;
+                *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 &&
+                               md->offset >= (md->length - md->max_size));
+                RETURN (me);
+        }
+
+ failed:
+        CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
+                " offset %d length %d: no match\n",
+                ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
+                src_nid, src_pid, index, match_bits, roffset, rlength);
+        RETURN(NULL);
+}
+
+/*
+ * Maintain the list of peers whose traffic should be failed for testing.
+ *
+ * A non-zero args->threshold adds an entry for args->nid (ret->rc is
+ * PTL_FAIL if it cannot be allocated).  A zero threshold removes the
+ * entries for args->nid - every entry if that is PTL_NID_ANY - along with
+ * any entry whose threshold has already dropped to zero.
+ */
+int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+{
+        PtlFailNid_in     *in  = v_args;
+        PtlFailNid_out    *out = v_ret;
+        lib_test_peer_t   *peer;
+        struct list_head  *pos;
+        struct list_head  *nxt;
+        struct list_head   doomed;
+        unsigned long      flags;
+
+        if (in->threshold != 0) {
+                /* Adding a new entry */
+                peer = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*peer));
+                if (peer == NULL) {
+                        out->rc = PTL_FAIL;
+                        return (out->rc);
+                }
+
+                peer->tp_threshold = in->threshold;
+                peer->tp_nid = in->nid;
+
+                state_lock (nal, &flags);
+                list_add (&peer->tp_list, &nal->ni.ni_test_peers);
+                state_unlock (nal, &flags);
+
+                out->rc = PTL_OK;
+                return (out->rc);
+        }
+
+        /* Removing entries: collect them on a private list under the lock
+         * and free them only after it has been dropped */
+        INIT_LIST_HEAD (&doomed);
+
+        state_lock (nal, &flags);
+
+        list_for_each_safe (pos, nxt, &nal->ni.ni_test_peers) {
+                peer = list_entry (pos, lib_test_peer_t, tp_list);
+
+                /* keep live entries that belong to some other peer */
+                if (peer->tp_threshold != 0 &&
+                    in->nid != PTL_NID_ANY &&
+                    peer->tp_nid != in->nid)
+                        continue;
+
+                list_del (&peer->tp_list);
+                list_add (&peer->tp_list, &doomed);
+        }
+
+        state_unlock (nal, &flags);
+
+        list_for_each_safe (pos, nxt, &doomed) {
+                peer = list_entry (pos, lib_test_peer_t, tp_list);
+
+                list_del (&peer->tp_list);
+                nal->cb_free (nal, peer, sizeof (*peer));
+        }
+
+        out->rc = PTL_OK;
+        return (out->rc);
+}
+
+/*
+ * Decide whether traffic to/from 'nid' should be failed, using the test
+ * peer list maintained by do_PtlFailNid().
+ *
+ * Returns 1 if a live entry names 'nid' (or names PTL_NID_ANY), else 0.
+ * A finite threshold on the matching entry is decremented.  Entries whose
+ * threshold has reached zero are freed here only when 'outgoing' is set.
+ */
+static int
+fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) 
+{
+        lib_test_peer_t  *tp;
+        struct list_head *el;
+        struct list_head *next;
+        unsigned long     flags;
+        struct list_head  cull;
+        int               fail = 0;
+
+        /* entries to free are parked here and released after unlocking */
+        INIT_LIST_HEAD (&cull);
+        
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+
+                if (tp->tp_threshold == 0) {
+                        /* zombie entry */
+                        if (outgoing) {
+                                /* only cull zombies on outgoing tests,
+                                 * since we may be at interrupt priority on
+                                 * incoming messages. */
+                                list_del (&tp->tp_list);
+                                list_add (&tp->tp_list, &cull);
+                        }
+                        continue;
+                }
+                        
+                if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */
+                    nid == tp->tp_nid) {        /* fail this peer */
+                        fail = 1;
+                        
+                        /* PTL_MD_THRESH_INF means "fail forever": never
+                         * count that entry down */
+                        if (tp->tp_threshold != PTL_MD_THRESH_INF) {
+                                tp->tp_threshold--;
+                                if (outgoing &&
+                                    tp->tp_threshold == 0) {
+                                        /* see above */
+                                        list_del (&tp->tp_list);
+                                        list_add (&tp->tp_list, &cull);
+                                }
+                        }
+                        /* only the first matching entry is consumed */
+                        break;
+                }
+        }
+        
+        state_unlock (nal, &flags);
+
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                list_del (&tp->tp_list);
+                
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+
+        return (fail);
+}
+
+/* Return the total number of bytes described by the first 'niov' entries
+ * of 'iov' (0 when niov <= 0). */
+ptl_size_t
+lib_iov_nob (int niov, struct iovec *iov)
+{
+        ptl_size_t total = 0;
+        int        i;
+
+        for (i = 0; i < niov; i++)
+                total += iov[i].iov_len;
+
+        return (total);
+}
+
+/* Gather 'len' bytes from the fragments of 'iov' into the contiguous
+ * buffer 'dest'.  Asserts that the fragments do not run out first. */
+void
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+{
+        ptl_size_t chunk;
+
+        for (; len > 0; niov--, iov++) {
+                LASSERT (niov > 0);
+
+                /* take the whole fragment, or just what is still wanted */
+                chunk = MIN (iov->iov_len, len);
+                memcpy (dest, iov->iov_base, chunk);
+
+                dest += chunk;
+                len -= chunk;
+        }
+}
+
+/* Scatter 'len' bytes from the contiguous buffer 'src' into the fragments
+ * of 'iov'.  Asserts that the fragments do not run out first. */
+void
+lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+{
+        ptl_size_t chunk;
+
+        for (; len > 0; niov--, iov++) {
+                LASSERT (niov > 0);
+
+                /* fill the whole fragment, or just what is still left */
+                chunk = MIN (iov->iov_len, len);
+                memcpy (iov->iov_base, src, chunk);
+
+                src += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Describe the byte range [offset, offset + len) of the MD's iovec as a
+ * new fragment list written to 'dst'.
+ *
+ * Returns the number of entries written to 'dst' (0 when len == 0).
+ * The MD's own iovec is only read, never modified.
+ */
+static int
+lib_extract_iov (struct iovec *dst, lib_md_t *md,
+                 ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        int             src_niov = md->md_niov;  
+        struct iovec   *src = md->md_iov.iov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->iov_len) {      /* skip initial frags */
+                offset -= src->iov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        /* 'offset' is now relative to the start of fragment 'src' */
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                /* what remains of this source fragment past 'offset' */
+                frag_len = src->iov_len - offset;
+                dst->iov_base = ((char *)src->iov_base) + offset;
+
+                if (len <= frag_len) {
+                        /* the range ends inside this fragment */
+                        dst->iov_len = len;
+                        return (dst_niov);
+                }
+                
+                dst->iov_len = frag_len;
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                /* every later fragment is used from its beginning */
+                offset = 0;
+        }
+}
+
+#ifndef __KERNEL__
+/* Non-kernel build stub: asserts unconditionally, so it must never be
+ * called here. */
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        LASSERT (0);
+        return (0);
+}
+
+/* Non-kernel build stub: asserts unconditionally, so it must never be
+ * called here. */
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+/* Non-kernel build stub: asserts unconditionally, so it must never be
+ * called here.  NB the buffer parameter is named 'dest' in this stub but
+ * 'src' in the kernel version. */
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+/* Non-kernel build stub: asserts unconditionally, so it must never be
+ * called here. */
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        LASSERT (0);
+        /* a function returning int must not fall off its end; report
+         * "no fragments", matching the lib_kiov_nob() stub */
+        return (0);
+}
+
+#else
+
+/* Return the total number of bytes described by the first 'niov' entries
+ * of 'kiov' (0 when niov <= 0). */
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov)
+{
+        ptl_size_t  total = 0;
+        int         i;
+
+        for (i = 0; i < niov; i++)
+                total += kiov[i].kiov_len;
+
+        return (total);
+}
+
+/* Gather 'len' bytes from the page fragments of 'kiov' into the
+ * contiguous buffer 'dest', mapping each page with kmap() for the
+ * duration of its copy.  Asserts that it is not called in interrupt
+ * context. */
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        ptl_size_t  chunk;
+        char       *mapped;
+
+        LASSERT (!in_interrupt ());
+
+        for (; len > 0; niov--, kiov++) {
+                LASSERT (niov > 0);
+
+                chunk = MIN (kiov->kiov_len, len);
+
+                mapped = (char *)kmap (kiov->kiov_page);
+                memcpy (dest, mapped + kiov->kiov_offset, chunk);
+                kunmap (kiov->kiov_page);
+
+                dest += chunk;
+                len -= chunk;
+        }
+}
+
+/* Scatter 'len' bytes from the contiguous buffer 'src' into the page
+ * fragments of 'kiov', mapping each page with kmap() for the duration of
+ * its copy.  Asserts that it is not called in interrupt context. */
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+{
+        ptl_size_t  chunk;
+        char       *mapped;
+
+        LASSERT (!in_interrupt ());
+
+        for (; len > 0; niov--, kiov++) {
+                LASSERT (niov > 0);
+
+                chunk = MIN (kiov->kiov_len, len);
+
+                mapped = (char *)kmap (kiov->kiov_page);
+                memcpy (mapped + kiov->kiov_offset, src, chunk);
+                kunmap (kiov->kiov_page);
+
+                src += chunk;
+                len -= chunk;
+        }
+}
+
+/*
+ * Describe the byte range [offset, offset + len) of the MD's page vector
+ * as a new fragment list written to 'dst'.
+ *
+ * Returns the number of entries written to 'dst' (0 when len == 0).
+ * The MD's own kiov is only read, never modified.
+ */
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        int             src_niov = md->md_niov;  
+        ptl_kiov_t     *src = md->md_iov.kiov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->kiov_len) {      /* skip initial frags */
+                offset -= src->kiov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        /* 'offset' is now relative to the start of fragment 'src' */
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                /* what remains of this source fragment past 'offset' */
+                frag_len = src->kiov_len - offset;
+                dst->kiov_page = src->kiov_page;
+                dst->kiov_offset = src->kiov_offset + offset;
+
+                if (len <= frag_len) {
+                        /* the range ends inside this fragment */
+                        dst->kiov_len = len;
+                        LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+                        return (dst_niov);
+                }
+
+                dst->kiov_len = frag_len;
+                /* a fragment must not extend past the end of its page */
+                LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                /* every later fragment is used from its beginning */
+                offset = 0;
+        }
+}
+#endif
+
+/*
+ * Hand an incoming message body to the NAL: 'mlen' bytes are wanted at
+ * 'offset' within 'md', out of 'rlen' bytes on the wire.
+ *
+ * With mlen == 0 neither 'msg' nor 'md' is touched (parse_put() passes
+ * NULL for both when dropping) and the NAL is given an empty fragment
+ * list.  Otherwise the fragment list is built in msg->msg_iov and passed
+ * to cb_recv or, for PTL_MD_KIOV descriptors, to cb_recv_pages.
+ */
+void
+lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+          ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+{
+        int   nfrags;
+
+        if (mlen == 0) {
+                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
+                return;
+        }
+
+        if ((md->options & PTL_MD_KIOV) != 0) {
+                /* page-based descriptor */
+                nfrags = lib_extract_kiov (msg->msg_iov.kiov, md,
+                                           offset, mlen);
+                nal->cb_recv_pages (nal, private, msg,
+                                    nfrags, msg->msg_iov.kiov, mlen, rlen);
+                return;
+        }
+
+        /* iovec-based descriptor */
+        nfrags = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
+        nal->cb_recv (nal, private, msg,
+                      nfrags, msg->msg_iov.iov, mlen, rlen);
+}
+
+/*
+ * Hand an outgoing message to the NAL: header 'hdr' plus 'len' bytes
+ * taken from 'md' starting at 'offset'.
+ *
+ * With len == 0 'md' is not touched and the NAL is given an empty
+ * fragment list.  Otherwise the fragment list is built in msg->msg_iov
+ * and passed to cb_send or, for PTL_MD_KIOV descriptors, to
+ * cb_send_pages.  Returns whatever the NAL callback returned.
+ */
+int
+lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+          ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+          lib_md_t *md, ptl_size_t offset, ptl_size_t len)
+{
+        int   nfrags;
+        int   rc;
+
+        if (len == 0) {
+                /* header only */
+                rc = nal->cb_send (nal, private, msg,
+                                   hdr, type, nid, pid,
+                                   0, NULL, 0);
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+                /* page-based descriptor */
+                nfrags = lib_extract_kiov (msg->msg_iov.kiov, md,
+                                           offset, len);
+                rc = nal->cb_send_pages (nal, private, msg,
+                                         hdr, type, nid, pid,
+                                         nfrags, msg->msg_iov.kiov, len);
+        } else {
+                /* iovec-based descriptor */
+                nfrags = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
+                rc = nal->cb_send (nal, private, msg,
+                                   hdr, type, nid, pid,
+                                   nfrags, msg->msg_iov.iov, len);
+        }
+
+        return (rc);
+}
+
+/*
+ * Allocate and zero a message descriptor bound to 'md', charge it
+ * against the MD (pending count up, finite threshold down), update the
+ * allocation counters and put it on the NI's active message list.
+ *
+ * Returns NULL if no message can be allocated, in which case nothing
+ * has been changed.  ALWAYS called holding the state_lock.
+ */
+static lib_msg_t *
+get_new_msg (nal_cb_t *nal, lib_md_t *md)
+{
+        lib_counters_t *stats;
+        lib_msg_t      *msg;
+
+        msg = lib_msg_alloc (nal);
+        if (msg == NULL)
+                return (NULL);
+
+        memset (msg, 0, sizeof (*msg));
+
+        msg->send_ack = 0;
+        msg->md = md;
+        msg->ev.arrival_time = get_cycles();
+
+        /* the MD now has one more operation in flight... */
+        md->pending++;
+
+        /* ...and, unless its threshold is infinite, one fewer to accept */
+        if (md->threshold != PTL_MD_THRESH_INF) {
+                LASSERT (md->threshold > 0);
+                md->threshold--;
+        }
+
+        /* track the current and high-water message counts */
+        stats = &nal->ni.counters;
+        stats->msgs_alloc++;
+        if (stats->msgs_max < stats->msgs_alloc)
+                stats->msgs_max = stats->msgs_alloc;
+
+        list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+
+        return (msg);
+}
+
+
+/*
+ * Incoming messages have a ptl_msg_t object associated with them
+ * by the library.  This object encapsulates the state of the
+ * message and allows the NAL to do non-blocking receives or sends
+ * of long messages.
+ *
+ */
+static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* Convert put fields to host byte order */
+        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
+                         hdr->src_nid, hdr->src_pid,
+                         PTL_HDR_LENGTH (hdr), hdr->msg.put.offset,
+                         hdr->msg.put.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
+               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+            !(md->options & PTL_MD_ACK_DISABLE)) {
+                msg->send_ack = 1;
+                msg->ack_wmd = hdr->msg.put.ack_wmd;
+                msg->nid = hdr->src_nid;
+                msg->pid = hdr->src_pid;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_PUT;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.put.ptl_index;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += mlength;
+
+        /* only unlink after MD's pending count has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming GET request.
+ *
+ * The GET-specific header fields are converted to host byte order in place,
+ * the request is matched against the portal table with lib_find_me(), and
+ * the matched MD's data is sent back to the initiator as a REPLY.
+ *
+ * Returns 0 once the REPLY has been handed to lib_send(), or -1 if the
+ * request was dropped.  Either way the incoming message is completed with
+ * lib_recv() and no sink, since this code passes no MD to receive into.
+ */
+static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        ptl_hdr_t        reply;
+        unsigned long    flags;
+        int              rc;
+
+        /* Convert get fields to host byte order */
+        hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits);
+        hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index);
+        hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length);
+        hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset);
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.get.return_offset != 0)
+                CERROR("Unexpected non-zero get.return_offset %x from "
+                       LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
+                         hdr->src_nid, hdr->src_pid,
+                         hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                         hdr->msg.get.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        /* The length a GET asks for is its sink_length; do_PtlGet() sends
+         * GETs with a header length of 0, so PTL_HDR_LENGTH() would always
+         * report 0 here. */
+        CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
+               "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, hdr->msg.get.sink_length,
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_GET;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.get.ptl_index;
+                msg->ev.match_bits = hdr->msg.get.match_bits;
+                /* requested length of a GET is the initiator's sink length,
+                 * the same value that was handed to lib_find_me() above */
+                msg->ev.rlength = hdr->msg.get.sink_length;
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = 0;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        /* serving a GET means sending data, so it counts as a send */
+        ni->counters.send_count++;
+        ni->counters.send_length += mlength;
+
+        /* only unlink after MD's refcount has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        /* Build the REPLY header in network byte order */
+        memset (&reply, 0, sizeof (reply));
+        reply.type     = HTON__u32 (PTL_MSG_REPLY);
+        reply.dest_nid = HTON__u64 (hdr->src_nid);
+        reply.src_nid  = HTON__u64 (ni->nid);
+        reply.dest_pid = HTON__u32 (hdr->src_pid);
+        reply.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength);
+
+        /* the wire handle is passed back untouched (never byte-swapped) */
+        reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
+
+        rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
+                       hdr->src_nid, hdr->src_pid, md, offset, mlength);
+        if (rc != 0) {
+                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
+                       ni->nid, hdr->src_nid);
+                /* NOTE(review): 'msg' is not released on this path;
+                 * confirm whether lib_send() cleans up on failure. */
+                state_lock (nal, &flags);       /* drop: expects the lock */
+                goto drop;
+        }
+
+        /* Complete the incoming message */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return (rc);
+ drop:
+        /* entered with the state lock held */
+        ni->counters.drop_count++;
+        ni->counters.drop_length += hdr->msg.get.sink_length;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming REPLY message.
+ *
+ * The destination MD is found from the wire handle carried in the header.
+ * A payload longer than the MD is dropped unless the MD has PTL_MD_TRUNCATE
+ * set, in which case only the first md->length bytes are delivered.
+ *
+ * Returns 0 when the payload has been handed to lib_recv() for delivery,
+ * or -1 when the message was dropped (counted, then discarded).
+ */
+static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+        int              wire_len;      /* length announced in the header */
+        int              copy_len;      /* length actually delivered */
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.reply.dst_offset != 0)
+                CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n",
+                       hdr->msg.reply.dst_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                const char *md_state = (md == NULL) ? "invalid" : "inactive";
+
+                CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
+                        ni->nid, hdr->src_nid, md_state,
+                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                        hdr->msg.reply.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        LASSERT (md->offset == 0);
+
+        wire_len = PTL_HDR_LENGTH(hdr);
+        copy_len = wire_len;
+
+        if (copy_len > md->length) {
+                /* too big for the MD: only acceptable if it may truncate */
+                if (!(md->options & PTL_MD_TRUNCATE)) {
+                        CERROR (LPU64": Dropping REPLY from "LPU64
+                                " length %d for MD "LPX64" would overflow (%d)\n",
+                                ni->nid, hdr->src_nid, copy_len,
+                                hdr->msg.reply.dst_wmd.wh_object_cookie,
+                                md->length);
+                        goto drop;
+                }
+                copy_len = md->length;
+        }
+
+        CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n",
+               hdr->src_nid, copy_len, wire_len,
+               hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping REPLY from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* the event is only filled in when the MD has an event queue */
+        if (md->eq) {
+                msg->ev.type          = PTL_EVENT_REPLY;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.rlength       = wire_len;
+                msg->ev.mlength       = copy_len;
+                msg->ev.offset        = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += copy_len;
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, 0, copy_len, wire_len);
+        return 0;
+
+ drop:
+        /* entered with the state lock held */
+        ni->counters.drop_count++;
+        ni->counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Handle an incoming ACK message.
+ *
+ * The ACK-specific header fields are converted to host byte order in place
+ * and the destination MD is found from the wire handle in the header.  If
+ * that MD has an event queue, a PTL_EVENT_ACK event is prepared in the new
+ * message.
+ *
+ * Returns 0 when the ACK was accepted, or -1 when it was dropped.  Both
+ * paths finish with lib_recv() and no MD to receive into.
+ */
+static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t      *ni = &nal->ni;
+        lib_md_t      *md;
+        lib_msg_t     *msg;
+        unsigned long  flags;
+
+        /* Convert ack fields to host byte order */
+        hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength);
+        hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                const char *md_state = (md == NULL) ? "invalid" : "inactive";
+
+                CERROR(LPU64": Dropping ACK from "LPU64" to %s MD "
+                       LPX64"."LPX64"\n", ni->nid, hdr->src_nid, md_state,
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
+               ni->nid, hdr->src_nid,
+               hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        /* the event is only filled in when the MD has an event queue */
+        if (md->eq) {
+                msg->ev.type          = PTL_EVENT_ACK;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.match_bits    = hdr->msg.ack.match_bits;
+                msg->ev.mlength       = hdr->msg.ack.mlength;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        /* entered with the state lock held; no length is counted here */
+        ni->counters.drop_count++;
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+/*
+ * Map the type field of a header to a printable name.
+ * Returns a string literal; "<UNKNOWN>" for any unrecognised type.
+ */
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        char *name = "<UNKNOWN>";
+
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                name = "ACK";
+                break;
+        case PTL_MSG_PUT:
+                name = "PUT";
+                break;
+        case PTL_MSG_GET:
+                name = "GET";
+                break;
+        case PTL_MSG_REPLY:
+                name = "REPLY";
+                break;
+        case PTL_MSG_HELLO:
+                name = "HELLO";
+                break;
+        default:
+                break;
+        }
+
+        return (name);
+}
+
+/*
+ * Dump a portals header through the NAL's cb_printf callback.
+ *
+ * Prints the type, source and destination nid/pid, then the fields specific
+ * to PUT, GET, ACK and REPLY headers.  Other types print only the common
+ * part.  Values are printed as stored; presumably the caller has already
+ * converted the header to host byte order -- TODO confirm.
+ */
+void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str);
+        /* nids are 64 bit and pids 32 bit: use the matching conversions,
+         * as the rest of this file does (LPU64 and %u) */
+        nal->cb_printf(nal, "    From nid/pid "LPU64"/%u", hdr->src_nid,
+                       hdr->src_pid);
+        nal->cb_printf(nal, "    To nid/pid "LPU64"/%u\n", hdr->dest_nid,
+                       hdr->dest_pid);
+
+        switch (hdr->type) {
+        default:
+                break;
+
+        case PTL_MSG_PUT:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, ack md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n",
+                               hdr->msg.put.ptl_index,
+                               hdr->msg.put.ack_wmd.wh_interface_cookie,
+                               hdr->msg.put.ack_wmd.wh_object_cookie,
+                               hdr->msg.put.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, offset %d, hdr data "LPX64"\n",
+                               PTL_HDR_LENGTH(hdr), hdr->msg.put.offset,
+                               hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, return md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                               hdr->msg.get.return_wmd.wh_interface_cookie,
+                               hdr->msg.get.return_wmd.wh_object_cookie,
+                               hdr->msg.get.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, src offset %d\n",
+                               hdr->msg.get.sink_length,
+                               hdr->msg.get.src_offset);
+                break;
+
+        case PTL_MSG_ACK:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "manipulated length %d\n",
+                               hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                               hdr->msg.ack.dst_wmd.wh_object_cookie,
+                               hdr->msg.ack.mlength);
+                break;
+
+        case PTL_MSG_REPLY:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "length %d\n",
+                               hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                               hdr->msg.reply.dst_wmd.wh_object_cookie,
+                               PTL_HDR_LENGTH(hdr));
+                break;
+        }
+
+}                               /* end of print_hdr() */
+
+
+int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        unsigned long  flags;
+
+        /* NB static check; optimizer will elide this if it's right */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.put.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.get.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.reply.length));
+
+        /* convert common fields to host byte order */
+        hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
+        hdr->src_nid = NTOH__u64 (hdr->src_nid);
+        hdr->dest_pid = NTOH__u32 (hdr->dest_pid);
+        hdr->src_pid = NTOH__u32 (hdr->src_pid);
+        hdr->type = NTOH__u32 (hdr->type);
+        PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr));
+#if 0
+        nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n",
+                       nal->ni.nid, nal, hdr, hdr->type);
+        print_hdr(nal, hdr);
+#endif
+        if (hdr->type == PTL_MSG_HELLO) {
+                /* dest_nid is really ptl_magicversion_t */
+                ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
+
+                CERROR (LPU64": Dropping unexpected HELLO message: "
+                        "magic %d, version %d.%d from "LPD64"\n",
+                        nal->ni.nid, mv->magic, 
+                        mv->version_major, mv->version_minor,
+                        hdr->src_nid);
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+        
+        if (hdr->dest_nid != nal->ni.nid) {
+                CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
+                       " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
+                       hdr->src_nid, hdr->dest_nid);
+
+                state_lock (nal, &flags);
+                nal->ni.counters.drop_count++;
+                nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+                state_unlock (nal, &flags);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
+        {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": simulated failure\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                return (-1);
+        }
+        
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return (parse_ack(nal, hdr, private));
+        case PTL_MSG_PUT:
+                return (parse_put(nal, hdr, private));
+                break;
+        case PTL_MSG_GET:
+                return (parse_get(nal, hdr, private));
+                break;
+        case PTL_MSG_REPLY:
+                return (parse_reply(nal, hdr, private));
+                break;
+        default:
+                CERROR(LPU64": Dropping <unknown> message from "LPU64
+                       ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
+                       hdr->type);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+}
+
+
+/*
+ * Library side of PtlPut(): send the whole of an MD to a target process.
+ *
+ * v_args is a PtlPut_in and v_ret a PtlPut_out.  The result is stored in
+ * ret->rc and also returned:
+ *   PTL_INV_PROC  the target is currently selected for simulated failure
+ *   PTL_INV_MD    the MD handle is invalid or the MD's threshold is 0
+ *   PTL_NOSPACE   no message descriptor could be allocated
+ *   PTL_OK        the PUT was handed to lib_send()
+ */
+int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_ack_req_t ack_req_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlPut_in *args = v_args;
+        PtlPut_out *ret = v_ret;
+        ptl_hdr_t hdr;
+
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        ptl_process_id_t *id = &args->target_in;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+
+        ret->rc = PTL_OK;
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        /* Build the PUT header in network byte order */
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_PUT);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length);
+
+        /* NB handles only looked up by creator (no flips) */
+        if (args->ack_req_in == PTL_ACK_REQ) {
+                hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+                hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        } else {
+                hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+
+        hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.put.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.put.offset = HTON__u32 (args->offset_in);
+        hdr.msg.put.hdr_data = args->hdr_data_in;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("BAD: could not allocate msg!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /* Only count the send once it can no longer fail locally, so a
+         * failed allocation does not inflate the statistics. */
+        ni->counters.send_count++;
+        ni->counters.send_length += md->length;
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it, record in the message the information about this operation
+         * that will be written to the event queue once the message has
+         * been completed.
+         *
+         * NB. We're now committed to the PUT, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle. 
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = args->hdr_data_in;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        /* NOTE(review): the lib_send() result is ignored and PTL_OK is
+         * returned regardless; confirm this is intended. */
+        lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
+                  id->nid, id->pid, md, 0, md->length);
+
+        return ret->rc = PTL_OK;
+}
+
+
+/*
+ * Library side of PtlGet(): ask a target process to send data back into
+ * one of our MDs.
+ *
+ * v_args is a PtlGet_in and v_ret a PtlGet_out.  The result is stored in
+ * ret->rc and also returned:
+ *   PTL_INV_PROC  the target is currently selected for simulated failure
+ *   PTL_INV_MD    the MD handle is invalid or the MD's threshold is 0
+ *   PTL_NOSPACE   no message descriptor could be allocated
+ *   PTL_OK        the GET was handed to lib_send()
+ *
+ * The GET is sent with a header length of 0 and no data; the amount wanted
+ * travels in msg.get.sink_length.
+ */
+int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlGet_in *args = v_args;
+        PtlGet_out *ret = v_ret;
+        ptl_hdr_t hdr;
+        lib_msg_t *msg = NULL;
+        lib_ni_t *ni = &nal->ni;
+        ptl_process_id_t *id = &args->target_in;
+        lib_md_t *md;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping GET to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        LASSERT (md->offset == 0);
+
+        CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        /* Build the GET header in network byte order */
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_GET);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = 0;
+
+        /* NB handles only looked up by creator (no flips) */
+        hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+        hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+
+        hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.get.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
+        hdr.msg.get.sink_length = HTON__u32 (md->length);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /* Only count the send once it can no longer fail locally, so a
+         * failed allocation does not inflate the statistics. */
+        ni->counters.send_count++;
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it, record in the message the information to be written to the
+         * event queue once the message has been completed.
+         *
+         * NB. We're now committed to the GET, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle. 
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        /* NOTE(review): the lib_send() result is ignored and PTL_OK is
+         * returned regardless; confirm this is intended. */
+        lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
+                  id->nid, id->pid, NULL, 0, 0);
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c
new file mode 100644 (file)
index 0000000..20a6c66
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-msg.c
+ * Message decoding, parsing and finalizing routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+
+int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+{
+        lib_md_t     *md;
+        lib_eq_t     *eq;
+        int           rc;
+        unsigned long flags;
+
+        /* ni went down while processing this message */
+        if (nal->ni.up == 0) {
+                return -1;
+        }
+
+        if (msg == NULL)
+                return 0;
+
+        rc = 0;
+        if (msg->send_ack) {
+                ptl_hdr_t ack;
+
+                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+
+                memset (&ack, 0, sizeof (ack));
+                ack.type     = HTON__u32 (PTL_MSG_ACK);
+                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.src_nid  = HTON__u64 (nal->ni.nid);
+                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.src_pid  = HTON__u32 (nal->ni.pid);
+                PTL_HDR_LENGTH(&ack) = 0;
+
+                ack.msg.ack.dst_wmd = msg->ack_wmd;
+                ack.msg.ack.match_bits = msg->ev.match_bits;
+                ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
+
+                rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
+                               msg->nid, msg->pid, NULL, 0, 0);
+        }
+
+        md = msg->md;
+        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
+        eq = md->eq;
+
+        state_lock(nal, &flags);
+
+        if (eq != NULL) {
+                ptl_event_t  *ev = &msg->ev;
+                ptl_event_t  *eq_slot;
+
+                /* I have to hold the lock while I bump the sequence number
+                 * and copy the event into the queue.  If not, and I was
+                 * interrupted after bumping the sequence number, other
+                 * events could fill the queue, including the slot I just
+                 * allocated to this event.  On resuming, I would overwrite
+                 * a more 'recent' event with old event state, and
+                 * processes taking events off the queue would not detect
+                 * overflow correctly.
+                 */
+
+                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
+
+                /* size must be a power of 2 to handle a wrapped sequence # */
+                LASSERT (eq->size != 0 &&
+                         eq->size == LOWEST_BIT_SET (eq->size));
+                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+                /* Invalidate unlinked_me unless this is the last
+                 * event for an auto-unlinked MD.  Note that if md was
+                 * auto-unlinked, md->pending can only decrease
+                 */
+                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
+                    md->pending != 1)                       /* not last ref */
+                        ev->unlinked_me = PTL_HANDLE_NONE;
+
+                /* Copy the event into the allocated slot, ensuring all the
+                 * rest of the event's contents have been copied _before_
+                 * the sequence number gets updated.  A processes 'getting'
+                 * an event waits on the next queue slot's sequence to be
+                 * 'new'.  When it is, _all_ other event fields had better
+                 * be consistent.  I assert 'sequence' is the last member,
+                 * so I only need a 2 stage copy.
+                 */
+                LASSERT(sizeof (ptl_event_t) ==
+                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                                    offsetof (ptl_event_t, sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+                /* Updating the sequence number is what makes the event 'new' */
+
+                /* cb_write is not necessarily atomic, so this could
+                   cause a race with PtlEQGet */
+                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                                   (void *)&ev->sequence,sizeof (ev->sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+
+                /* I must also ensure that (a) callbacks are made in the
+                 * same order as the events land in the queue, and (b) the
+                 * callback occurs before the event can be removed from the
+                 * queue, so I can't drop the lock during the callback. */
+                if (nal->cb_callback != NULL)
+                        nal->cb_callback(nal, private, eq, ev);
+                else  if (eq->event_callback != NULL)
+                        (void)((eq->event_callback) (ev));
+        }
+
+        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
+                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+
+        md->pending--;
+        if (md->pending == 0 && /* no more outstanding operations on this md */
+            (md->threshold == 0 ||              /* done its business */
+             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+                lib_md_unlink(nal, md);
+
+        list_del (&msg->msg_list);
+        nal->ni.counters.msgs_alloc--;
+        lib_msg_free(nal, msg);
+
+        state_unlock(nal, &flags);
+
+        return rc;
+}
diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c
new file mode 100644 (file)
index 0000000..37dcb91
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-ni.c
+ * Network status registers and distance functions.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/* Distance reported when the NAL cannot supply one (see do_PtlNIDist).
+ * Spelled ~0UL so the constant fits unsigned long on every target instead
+ * of relying on a 64-bit decimal literal. */
+#define MAX_DIST (~0UL)
+
+/*
+ * Swap the interface debug mask: the previous mask is handed back in
+ * ret->rc and args->mask_in becomes the new mask.  Always returns 0.
+ */
+int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDebug_in  *in = v_args;
+        PtlNIDebug_out *out = v_ret;
+
+        /* capture the old mask before it is overwritten */
+        out->rc = nal->ni.debug;
+        nal->ni.debug = in->mask_in;
+
+        return 0;
+}
+
+/*
+ * Read one of the interface status registers.
+ *
+ * Incoming:
+ *      ptl_handle_ni_t interface_in
+ *      ptl_sr_index_t register_in
+ *
+ * Outgoing:
+ *      ptl_sr_value_t          * status_out
+ *
+ * Returns (and stores in ret->rc) PTL_OK on success, PTL_SEGV when no
+ * argument block was supplied, or PTL_INV_SR_INDX for an unknown register.
+ * status_out is 0 whenever the register index is not recognised.
+ */
+int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIStatus_in *in = v_args;
+        PtlNIStatus_out *out = v_ret;
+        lib_counters_t *ctrs = &nal->ni.counters;
+
+        if (in == NULL)
+                return out->rc = PTL_SEGV;
+
+        out->rc = PTL_OK;
+        out->status_out = 0;
+
+        /* map the requested register onto the matching counter */
+        switch (in->register_in) {
+        case PTL_SR_DROP_COUNT:
+                out->status_out = ctrs->drop_count;
+                break;
+        case PTL_SR_DROP_LENGTH:
+                out->status_out = ctrs->drop_length;
+                break;
+        case PTL_SR_RECV_COUNT:
+                out->status_out = ctrs->recv_count;
+                break;
+        case PTL_SR_RECV_LENGTH:
+                out->status_out = ctrs->recv_length;
+                break;
+        case PTL_SR_SEND_COUNT:
+                out->status_out = ctrs->send_count;
+                break;
+        case PTL_SR_SEND_LENGTH:
+                out->status_out = ctrs->send_length;
+                break;
+        case PTL_SR_MSGS_MAX:
+                out->status_out = ctrs->msgs_max;
+                break;
+        default:
+                out->rc = PTL_INV_SR_INDX;
+                break;
+        }
+
+        return out->rc;
+}
+
+
+/*
+ * Ask the NAL how far away a peer is.
+ *
+ * Incoming:
+ *      ptl_handle_ni_t interface_in
+ *      ptl_process_id_t process_in
+ *
+ * Outgoing:
+ *      unsigned long   * distance_out
+ *
+ * Returns PTL_OK with the distance filled in, or PTL_INV_PROC with
+ * distance_out set to MAX_DIST when the NAL's cb_dist callback fails.
+ * The same code is stored in ret->rc on both paths.
+ */
+int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDist_in *args = v_args;
+        PtlNIDist_out *ret = v_ret;
+        unsigned long dist;
+        ptl_nid_t nid = args->process_in.nid;
+
+        if (nal->cb_dist(nal, nid, &dist) != 0) {
+                ret->distance_out = (unsigned long) MAX_DIST;
+                /* record the failure in the reply block as well, like the
+                 * success path and the other do_Ptl* handlers do */
+                return ret->rc = PTL_INV_PROC;
+        }
+
+        ret->distance_out = dist;
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lustre/portals/portals/lib-not-impl.c b/lustre/portals/portals/lib-not-impl.c
new file mode 100644 (file)
index 0000000..78959b2
--- /dev/null
@@ -0,0 +1,37 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-not-impl.c
+ *
+ * boiler plate functions that can be used to write the 
+ * library side routines
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+
+/*
+ * Access-control entry stub.
+ *
+ * Incoming:
+ *      ptl_handle_ni_t ni_in
+ *      ptl_ac_index_t index_in
+ *      ptl_process_id_t match_id_in
+ *      ptl_pt_index_t portal_in
+ *
+ * Outgoing:
+ *      (nothing besides rc)
+ *
+ * Stores and returns PTL_SEGV when no argument block is supplied,
+ * otherwise PTL_NOT_IMPLEMENTED.
+ */
+int do_PtlACEntry(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlACEntry_in *in = v_args;
+        PtlACEntry_out *out = v_ret;
+
+        /* a missing argument block is reported ahead of "not implemented" */
+        if (in == NULL)
+                out->rc = PTL_SEGV;
+        else
+                out->rc = PTL_NOT_IMPLEMENTED;
+
+        return out->rc;
+}
diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c
new file mode 100644 (file)
index 0000000..e00e9f0
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-pid.c
+ * Process identification routines
+ */
+
+/* This should be removed.  The NAL should have the PID information */
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#if defined (__KERNEL__)
+#       include <linux/kernel.h>
+extern int getpid(void);
+#else
+#       include <stdio.h>
+#       include <unistd.h>
+#endif
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Report this interface's process id.
+ *
+ * Incoming:
+ *      ptl_handle_ni_t handle_in
+ *
+ * Outgoing:
+ *      ptl_process_id_t        * id_out
+ *
+ * Copies the nid/pid pair held in nal->ni into the reply block; no other
+ * reply field is written here.  Stores and returns PTL_OK.
+ */
+int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlGetId_out *out = v_ret;
+
+        out->id_out.nid = nal->ni.nid;
+        out->id_out.pid = nal->ni.pid;
+        out->rc = PTL_OK;
+
+        return out->rc;
+}
diff --git a/lustre/portals/router/Makefile.am b/lustre/portals/router/Makefile.am
new file mode 100644 (file)
index 0000000..1c8087b
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+MODULE = kptlrouter
+modulenet_DATA = kptlrouter.o
+EXTRA_PROGRAMS = kptlrouter
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kptlrouter_SOURCES = router.c proc.c router.h
diff --git a/lustre/portals/router/Makefile.mk b/lustre/portals/router/Makefile.mk
new file mode 100644 (file)
index 0000000..64bd09b
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += kptlrouter.o
+kptlrouter-objs    := router.o proc.o
diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c
new file mode 100644 (file)
index 0000000..dd65b34
--- /dev/null
@@ -0,0 +1,78 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+#define KPR_PROC_ROUTER "sys/portals/router"
+
+/*
+ * read_proc handler for the router statistics entry.
+ *
+ * Writes one line, "<bytes> <packets> <errors> <queue depth>\n", into
+ * 'page' and returns its length.  Only a read at offset 0 produces data;
+ * any other offset returns 0.  *eof is always set.
+ */
+int
+kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+        unsigned long long bytes = kpr_fwd_bytes;
+        unsigned long      packets = kpr_fwd_packets;
+        unsigned long      errors = kpr_fwd_errors;
+        unsigned int       qdepth = atomic_read (&kpr_queue_depth);
+        int                len;
+
+        *eof = 1;
+        if (off != 0)
+                return (0);
+
+        /* all four values are unsigned: use unsigned conversions so large
+         * counters are not printed as negative numbers */
+        len = sprintf (page, "%Lu %lu %lu %u\n", bytes, packets, errors, qdepth);
+
+        *start = page;
+        return (len);
+}
+
+/*
+ * write_proc handler for the router statistics entry.  The data written
+ * is never examined: any write simply resets the forwarding counters.
+ * Returns 'count' so the whole write is reported as consumed.
+ */
+int
+kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data)
+{
+        kpr_fwd_bytes = 0;
+        kpr_fwd_packets = 0;
+        kpr_fwd_errors = 0;
+
+        return count;
+}
+
+/*
+ * Create the KPR_PROC_ROUTER proc entry and attach the read/write
+ * handlers.  Failure to create the entry is logged and otherwise ignored.
+ */
+void
+kpr_proc_init(void)
+{
+        struct proc_dir_entry *pde;
+
+        pde = create_proc_entry (KPR_PROC_ROUTER,
+                                 S_IFREG | S_IRUGO | S_IWUSR, NULL);
+        if (pde != NULL) {
+                pde->data = NULL;
+                pde->read_proc = kpr_proc_read;
+                pde->write_proc = kpr_proc_write;
+                return;
+        }
+
+        CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER);
+}
+
+/* Remove the proc entry created by kpr_proc_init(). */
+void
+kpr_proc_fini(void)
+{
+        remove_proc_entry(KPR_PROC_ROUTER, NULL);
+}
diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c
new file mode 100644 (file)
index 0000000..8a1de08
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+/* Routing table (list of kpr_route_entry_t) and registered NALs (list of
+ * kpr_nal_entry_t); both are protected by kpr_rwlock */
+struct list_head kpr_routes;
+struct list_head kpr_nals;
+
+/* Forwarding statistics, exported through the proc entry in proc.c.
+ * The three plain counters are updated without locking. */
+unsigned long long kpr_fwd_bytes;
+unsigned long      kpr_fwd_packets;
+unsigned long      kpr_fwd_errors;
+atomic_t           kpr_queue_depth;
+
+/* Mostly the tables are read-only (thread and interrupt context)
+ *
+ * Once in a blue moon we register/deregister NALs and add/remove routing
+ * entries (thread context only)... */
+rwlock_t         kpr_rwlock;
+
+/* Entry points published for NALs (registered/exported at module init) */
+kpr_router_interface_t kpr_router_interface = {
+       kprri_register:         kpr_register_nal,
+       kprri_lookup:           kpr_lookup_target,
+       kprri_fwd_start:        kpr_forward_packet,
+       kprri_fwd_done:         kpr_complete_packet,
+       kprri_shutdown:         kpr_shutdown_nal,
+       kprri_deregister:       kpr_deregister_nal,
+};
+
+/* Entry points published for route table management */
+kpr_control_interface_t kpr_control_interface = {
+       kprci_add_route:        kpr_add_route,
+       kprci_del_route:        kpr_del_route,
+       kprci_get_route:        kpr_get_route,
+};
+
+/*
+ * Register a NAL with the router.
+ *
+ * A private copy of *nalif is added to kpr_nals and handed back through
+ * *argp; that pointer is the 'arg' expected by the other kprri_* calls.
+ * Must not be called from interrupt context.
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
+ * -EEXIST if a NAL with the same kprni_nalid is already registered.
+ */
+int
+kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
+{
+        unsigned long      flags;       /* irqsave state must be unsigned long */
+        struct list_head  *e;
+        kpr_nal_entry_t   *ne;
+
+        CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+
+        PORTAL_ALLOC (ne, sizeof (*ne));
+        if (ne == NULL)
+                return (-ENOMEM);
+
+        /* refcount and shutdown flag start out as zero */
+        memset (ne, 0, sizeof (*ne));
+        ne->kpne_interface = *nalif;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        /* refuse a second registration of the same NAL id */
+        for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+        {
+                kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+                if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
+                {
+                        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                        CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid);
+
+                        PORTAL_FREE (ne, sizeof (*ne));
+                        return (-EEXIST);
+                }
+        }
+
+        list_add (&ne->kpne_list, &kpr_nals);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+        *argp = ne;
+        PORTAL_MODULE_USE;
+        return (0);
+}
+
+/*
+ * Start shutting a NAL down: mark its entry so no new lookups or forwards
+ * are accepted, then sleep (one second at a time) until every outstanding
+ * reference on the entry has been dropped.  Must not be called from
+ * interrupt context, and must be called only once per entry.
+ */
+void
+kpr_shutdown_nal (void *arg)
+{
+        unsigned long    flags;         /* irqsave state must be unsigned long */
+        kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+        LASSERT (!ne->kpne_shutdown);
+        LASSERT (!in_interrupt());
+
+        write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
+        ne->kpne_shutdown = 1;
+        write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
+
+        while (atomic_read (&ne->kpne_refcount) != 0)
+        {
+                CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
+                        ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
+
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+}
+
+/*
+ * Remove a NAL's entry from kpr_nals and free it.  The caller must have
+ * completed kpr_shutdown_nal() first, so the entry is marked shut down
+ * and carries no references.  Must not be called from interrupt context.
+ */
+void
+kpr_deregister_nal (void *arg)
+{
+        unsigned long     flags;        /* irqsave state must be unsigned long */
+        kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+        LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
+        LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
+        LASSERT (!in_interrupt());
+
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        list_del (&ne->kpne_list);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+
+        PORTAL_FREE (ne, sizeof (*ne));
+        PORTAL_MODULE_UNUSE;
+}
+
+
+/*
+ * Find the gateway through which the calling NAL can reach target_nid.
+ *
+ * The first route whose [lo, hi] range contains target_nid decides the
+ * outcome: 0 with *gateway_nidp filled in when its gateway is on the
+ * caller's NAL, -EHOSTUNREACH when it is on a different NAL.  -ENOENT is
+ * returned when no route matches or the caller is shutting down.
+ */
+int
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+{
+        kpr_nal_entry_t  *caller = (kpr_nal_entry_t *)arg;
+        struct list_head *pos;
+        int               result = -ENOENT;
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, caller->kpne_interface.kprni_nalid);
+
+        if (caller->kpne_shutdown)              /* caller is shutting down */
+                return (-ENOENT);
+
+        read_lock (&kpr_rwlock);
+
+        /* Walk the routes looking for the first range covering target_nid */
+        pos = kpr_routes.next;
+        while (pos != &kpr_routes) {
+                kpr_route_entry_t *route = list_entry (pos, kpr_route_entry_t, kpre_list);
+
+                pos = pos->next;
+
+                if (target_nid < route->kpre_lo_nid ||
+                    target_nid > route->kpre_hi_nid)
+                        continue;               /* outside this route's range */
+
+                if (route->kpre_gateway_nalid == caller->kpne_interface.kprni_nalid) {
+                        result = 0;
+                        *gateway_nidp = route->kpre_gateway_nid;
+                } else {
+                        result = -EHOSTUNREACH; /* gateway is on a different NAL */
+                }
+                break;
+        }
+
+        read_unlock (&kpr_rwlock);
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+                target_nid, caller->kpne_interface.kprni_nalid, result,
+                (result == 0) ? *gateway_nidp : (ptl_nid_t)0);
+        return (result);
+}
+
+/*
+ * Forward a packet arriving on the calling (source) NAL towards
+ * fwd->kprfd_target_nid.
+ *
+ * The first route covering the target picks the gateway.  If that gateway
+ * is on a different, registered NAL that is not shutting down, the packet
+ * is handed to that NAL's kprni_fwd method; source and destination entries
+ * each hold a reference until kpr_complete_packet() runs.  Otherwise the
+ * forward fails on the spot: the error counter is bumped and the
+ * descriptor's callback is invoked with -EHOSTUNREACH.
+ */
+void
+kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        kpr_nal_entry_t  *src_ne = (kpr_nal_entry_t *)arg;
+        ptl_nid_t         target_nid = fwd->kprfd_target_nid;
+        int               nob = fwd->kprfd_nob;
+        struct list_head *e;
+        struct list_head *n;
+
+        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
+        LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov));
+
+        atomic_inc (&kpr_queue_depth);
+
+        /* Take the source reference before anything can jump to 'out':
+         * the failure path below drops it unconditionally, so it has to be
+         * held on the shutdown path too or the count would underflow. */
+        atomic_inc (&src_ne->kpne_refcount);    /* source nal is busy until fwd completes */
+
+        kpr_fwd_packets++;                   /* (loose) stats accounting */
+        kpr_fwd_bytes += nob;
+
+        if (src_ne->kpne_shutdown)                      /* caller is shutting down */
+                goto out;
+
+        fwd->kprfd_router_arg = src_ne;         /* stash caller's nal entry */
+
+        read_lock (&kpr_rwlock);
+
+        /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+        {
+                kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+                if (re->kpre_lo_nid > target_nid || /* no match */
+                    re->kpre_hi_nid < target_nid)
+                        continue;
+
+                CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
+                        target_nid, src_ne->kpne_interface.kprni_nalid,
+                        re->kpre_gateway_nid, re->kpre_gateway_nalid);
+
+                if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
+                        break;                  /* don't route to same NAL */
+
+                /* Search for gateway's NAL's entry */
+
+                for (n = kpr_nals.next; n != &kpr_nals; n = n->next)
+                {
+                        kpr_nal_entry_t *dst_ne = list_entry (n, kpr_nal_entry_t, kpne_list);
+
+                        if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
+                                continue;
+
+                        if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
+                                break;
+
+                        fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
+                        atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+
+                        read_unlock (&kpr_rwlock);
+
+                        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
+                                target_nid, src_ne->kpne_interface.kprni_nalid,
+                                fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+
+                        dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+                        return;
+                }
+
+                /* only the first matching route is ever considered */
+                break;
+        }
+
+        read_unlock (&kpr_rwlock);
+ out:
+        kpr_fwd_errors++;
+
+        CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        /* Can't find anywhere to forward to */
+        (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
+
+        atomic_dec (&kpr_queue_depth);
+        atomic_dec (&src_ne->kpne_refcount);
+}
+
+/*
+ * Completion of a forward started by kpr_forward_packet(); 'arg' is the
+ * destination NAL's entry and 'error' the outcome of the send.
+ *
+ * Order matters: the destination reference is dropped first, then the
+ * descriptor's callback is invoked with 'error', and only after that are
+ * the queue depth and the source reference released.  Neither entry may
+ * be touched after its reference has been dropped.
+ */
+void
+kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
+{
+       kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
+       /* source entry stashed in the descriptor by kpr_forward_packet() */
+       kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
+
+        CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
+
+       atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
+
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
+
+        CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, error);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
+}
+
+/*
+ * Add a route: NIDs in the inclusive range [lo_nid, hi_nid] are reached
+ * via gateway_nid on NAL gateway_nalid.  Must not be called from interrupt
+ * context; lo_nid must not exceed hi_nid.
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
+ * -EINVAL if the range overlaps an existing route.
+ */
+int
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+               ptl_nid_t hi_nid)
+{
+        unsigned long      flags;       /* irqsave state must be unsigned long */
+        struct list_head  *e;
+        kpr_route_entry_t *re;
+
+        CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+               gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        LASSERT(lo_nid <= hi_nid);
+
+        PORTAL_ALLOC (re, sizeof (*re));
+        if (re == NULL)
+                return (-ENOMEM);
+
+        re->kpre_gateway_nalid = gateway_nalid;
+        re->kpre_gateway_nid = gateway_nid;
+        re->kpre_lo_nid = lo_nid;
+        re->kpre_hi_nid = hi_nid;
+
+        LASSERT(!in_interrupt());
+        write_lock_irqsave (&kpr_rwlock, flags);
+
+        /* reject the new range if it intersects any existing one */
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
+                                                    kpre_list);
+
+                if (re->kpre_lo_nid > re2->kpre_hi_nid ||
+                    re->kpre_hi_nid < re2->kpre_lo_nid)
+                        continue;       /* disjoint */
+
+                CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"] "
+                        "to ["LPX64" - "LPX64"]\n",
+                        re->kpre_lo_nid, re->kpre_hi_nid,
+                        re2->kpre_lo_nid, re2->kpre_hi_nid);
+
+                write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                PORTAL_FREE (re, sizeof (*re));
+                return (-EINVAL);
+        }
+
+        list_add (&re->kpre_list, &kpr_routes);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+        return (0);
+}
+
+/*
+ * Delete the first route whose [lo, hi] range contains 'nid'.  Must not be
+ * called from interrupt context.
+ *
+ * Returns 0 if a route was removed, -ENOENT if none covers 'nid'.
+ */
+int
+kpr_del_route (ptl_nid_t nid)
+{
+        unsigned long      flags;       /* irqsave state must be unsigned long */
+        struct list_head  *e;
+
+        CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+
+        LASSERT(!in_interrupt());
+        write_lock_irqsave(&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+                        continue;
+
+                /* unlink under the lock, free once it has been dropped */
+                list_del (&re->kpre_list);
+                write_unlock_irqrestore(&kpr_rwlock, flags);
+
+                PORTAL_FREE(re, sizeof (*re));
+                return (0);
+        }
+
+        write_unlock_irqrestore(&kpr_rwlock, flags);
+        return (-ENOENT);
+}
+
+/*
+ * Fetch the idx'th route (counting from 0 in list order) into the four
+ * output parameters.  Returns 0 on success or -ENOENT when the table holds
+ * no entry at that position; the outputs are untouched in that case.
+ */
+int
+kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+              ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+{
+        struct list_head  *pos;
+        int                rc = -ENOENT;
+
+        read_lock(&kpr_rwlock);
+
+        for (pos = kpr_routes.next; pos != &kpr_routes; pos = pos->next) {
+                kpr_route_entry_t *route;
+
+                if (idx-- != 0)
+                        continue;       /* not there yet */
+
+                route = list_entry(pos, kpr_route_entry_t, kpre_list);
+
+                *gateway_nalid = route->kpre_gateway_nalid;
+                *gateway_nid = route->kpre_gateway_nid;
+                *lo_nid = route->kpre_lo_nid;
+                *hi_nid = route->kpre_hi_nid;
+
+                rc = 0;
+                break;
+        }
+
+        read_unlock(&kpr_rwlock);
+        return (rc);
+}
+
+/*
+ * Module exit: every NAL must already have deregistered.  Frees whatever
+ * routes remain, removes the proc entry and withdraws the two published
+ * interfaces.
+ */
+static void __exit
+kpr_finalise (void)
+{
+        LASSERT (list_empty (&kpr_nals));
+
+        /* drain the routing table from the head */
+        for (;;) {
+                kpr_route_entry_t *route;
+
+                if (list_empty (&kpr_routes))
+                        break;
+
+                route = list_entry(kpr_routes.next, kpr_route_entry_t,
+                                   kpre_list);
+
+                list_del(&route->kpre_list);
+                PORTAL_FREE(route, sizeof (*route));
+        }
+
+        kpr_proc_fini();
+
+        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
+
+        CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
+               atomic_read(&portal_kmemory));
+}
+
+/*
+ * Module init: set up the lock and the (empty) NAL and route lists, create
+ * the proc entry and publish the router and control interfaces.
+ * Always returns 0.
+ */
+static int __init
+kpr_initialise (void)
+{
+        CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+        /* tables and lock must be ready before anything is published */
+        INIT_LIST_HEAD(&kpr_nals);
+        INIT_LIST_HEAD(&kpr_routes);
+        rwlock_init(&kpr_rwlock);
+
+        kpr_proc_init();
+
+        PORTAL_SYMBOL_REGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_REGISTER(kpr_control_interface);
+
+        return 0;
+}
+
+MODULE_AUTHOR("Eric Barton");
+MODULE_DESCRIPTION("Kernel Portals Router v0.01");
+MODULE_LICENSE("GPL");
+
+module_init (kpr_initialise);
+module_exit (kpr_finalise);
+
+EXPORT_SYMBOL (kpr_control_interface);
+EXPORT_SYMBOL (kpr_router_interface);
diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h
new file mode 100644 (file)
index 0000000..b8c3bec
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _KPTLROUTER_H
+#define _KPTLROUTER_H
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#define DEBUG_SUBSYSTEM S_PTLROUTER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+/* One registered NAL, as tracked by the router */
+typedef struct
+{
+       struct list_head        kpne_list;      /* chain on kpr_nals */
+       kpr_nal_interface_t     kpne_interface; /* copy of the interface passed at registration */
+       atomic_t                kpne_refcount;  /* held by forwards in flight to/from this NAL */
+       int                     kpne_shutdown;  /* set once kpr_shutdown_nal() has been called */
+} kpr_nal_entry_t;
+
+/* One routing table entry: NIDs in [kpre_lo_nid, kpre_hi_nid] (inclusive)
+ * are reached through the given gateway */
+typedef struct
+{
+       struct list_head        kpre_list;              /* chain on kpr_routes */
+       int                     kpre_gateway_nalid;     /* NAL the gateway is on */
+       ptl_nid_t               kpre_gateway_nid;       /* NID of the gateway */
+       ptl_nid_t               kpre_lo_nid;            /* lowest target NID covered */
+        ptl_nid_t               kpre_hi_nid;            /* highest target NID covered */
+} kpr_route_entry_t;
+
+/* NAL-facing entry points (router.c); 'arg' is the pointer returned
+ * through argp by kpr_register_nal() */
+extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_shutdown_nal (void *arg);
+extern void kpr_deregister_nal (void *arg);
+
+/* proc entry setup/teardown (proc.c) */
+extern void kpr_proc_init (void);
+extern void kpr_proc_fini (void);
+
+/* Route table management (router.c) */
+extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, 
+                          ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, 
+                          ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+
+/* Forwarding statistics (defined in router.c, reported by proc.c) */
+extern unsigned long long kpr_fwd_bytes;
+extern unsigned long      kpr_fwd_packets;
+extern unsigned long      kpr_fwd_errors;
+extern atomic_t           kpr_queue_depth;
+
+#endif /* _KPTLROUTER_H */
diff --git a/lustre/portals/tests/.cvsignore b/lustre/portals/tests/.cvsignore
new file mode 100644 (file)
index 0000000..051d1bd
--- /dev/null
@@ -0,0 +1,3 @@
+Makefile
+Makefile.in
+.deps
diff --git a/lustre/portals/tests/Makefile.am b/lustre/portals/tests/Makefile.am
new file mode 100644 (file)
index 0000000..7b47ae0
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r
+LINK = $(LD) $(LDFLAGS) -o $@
+DEFS =
+LIBS =
+MODULE = $(basename)
+EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh
+
+noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o 
+
+pingsrv_o_SOURCES = ping_srv.c ping.h
+
+pingcli_o_SOURCES = ping_cli.c ping.h
+
+spingsrv_o_SOURCES = sping_srv.c ping.h
+
+spingcli_o_SOURCES = sping_cli.c ping.h
diff --git a/lustre/portals/tests/ping.h b/lustre/portals/tests/ping.h
new file mode 100644 (file)
index 0000000..f07444b
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _KPING_INCLUDED
+#define _KPING_INCLUDED
+
+#include <portals/p30.h>
+
+
+#define PTL_PING_IN_SIZE               256     // n packets per buffer
+#define PTL_PING_IN_BUFFERS            2       // n fallback buffers
+
+/* NOTE(review): presumably the portal indices used by the client and the
+ * server side -- confirm against ping_cli.c / ping_srv.c */
+#define PTL_PING_CLIENT                        4
+#define PTL_PING_SERVER                        5
+
+/* Magic values; NOTE(review): their users are outside this header */
+#define PING_HEADER_MAGIC              0xDEADBEEF
+#define PING_BULK_MAGIC                        0xCAFEBABE
+
+/* PING_IGNORE_BITS has every bit set except PING_HEAD_BITS and
+ * PING_BULK_BITS (within 32 bits) */
+#define PING_HEAD_BITS                 0x00000001
+#define PING_BULK_BITS                 0x00000002
+#define PING_IGNORE_BITS               0xFFFFFFFC
+
+/* Option flags; each value is a distinct single bit */
+#define PTL_PING_ACK                   0x01
+#define PTL_PING_VERBOSE               0x02
+#define PTL_PING_VERIFY                        0x04
+#define PTL_PING_PREALLOC              0x08
+
+
+/* Index of the buffer following 'index', wrapping back to 0 after the last
+ * of the PTL_PING_IN_BUFFERS buffers.  NB 'index' is evaluated more than
+ * once, so it must be free of side effects. */
+#define NEXT_PRIMARY_BUFFER(index)             \
+       ((((index) + 1) >= PTL_PING_IN_BUFFERS) ? 0 : ((index) + 1))
+
+/* Log 'str' together with the ptl_err_str[] text and the numeric value of
+ * error code 'err' ('err' is evaluated twice). */
+#define PDEBUG(str, err)                       \
+       CERROR ("%s: error=%s (%d)\n", (str), ptl_err_str[(err)], (err))
+
+
+/* Ping data to be passed via the ioctl to kernel space */
+
+#if __KERNEL__
+
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+/* State kept by the kernel ping server.
+ * NOTE(review): the code that fills these fields is not part of this
+ * header; the per-field notes below are read off the types and names only. */
+struct pingsrv_data {
+        
+        ptl_handle_ni_t         ni;             /* network interface handle */
+        ptl_handle_me_t         me;             /* match entry handle */
+        ptl_handle_eq_t         eq;             /* event queue handle */
+        void                   *in_buf;         /* presumably the receive buffer -- confirm */
+        ptl_process_id_t        my_id;
+        ptl_process_id_t        id_local;
+        ptl_md_t                mdin;           /* memory descriptors ... */
+        ptl_md_t                mdout;
+        ptl_handle_md_t         mdin_h;         /* ... and their handles */
+        ptl_handle_md_t         mdout_h;
+        ptl_event_t             evnt;
+        struct task_struct     *tsk;
+}; /* struct pingsrv_data */
+/* State kept by the kernel ping client.
+ * NOTE(review): most users of these fields are outside this header; the
+ * per-field notes below are read off the types and names only. */
+struct pingcli_data {
+        
+        struct portal_ioctl_data *args;
+        ptl_handle_me_t        me;              /* match entry handle */
+        ptl_handle_eq_t                eq;              /* event queue handle */
+        char                          *inbuf;    
+        char                   *outbuf;   
+        ptl_process_id_t       myid; 
+        ptl_process_id_t       id_local; 
+        ptl_process_id_t       id_remote;
+        ptl_md_t               md_in_head;      /* memory descriptors ... */
+        ptl_md_t               md_out_head;
+        ptl_handle_md_t        md_in_head_h;    /* ... and their handles */
+        ptl_handle_md_t        md_out_head_h;
+        ptl_event_t            ev;
+        struct task_struct     *tsk;
+}; /* struct pingcli_data */
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _KPING_INCLUDED */
diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c
new file mode 100644 (file)
index 0000000..389ffbb
--- /dev/null
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+/* Size of the fixed ping header written by pingcli_start(): magic word,
+ * sequence number and send timestamp. */
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+
+/* NOTE(review): MAX_TIME is not referenced in the visible part of this file. */
+#define MAX_TIME 100000
+
+/* This should be enclosed in a structure */
+
+/* Session state: allocated by kping_client(), freed by pingcli_shutdown(). */
+static struct pingcli_data *client = NULL;
+
+/* Sequence number of the ping currently being sent / waited for; also
+ * read by pingcli_callback(). */
+static int count = 0;
+
+/*
+ * Release the ping client's Portals resources and the client structure.
+ *
+ * 'err' selects where the teardown sequence is entered; each case
+ * deliberately falls through into the ones below it:
+ *   1 - unlink the outgoing MD, then continue as for 2
+ *   2 - unlink the incoming MD, free the EQ, unlink the ME, then as 3
+ *   3 - drop the NAL reference, then as 4
+ *   4 - free the client structure
+ * Any other value releases nothing.  The in/out data buffers are NOT
+ * freed here.  Does nothing when there is no client.
+ */
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Every case dereferences 'client', so check it once up front
+         * rather than only in front of the final free. */
+        if (client == NULL)
+                return;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                        /* fall through */
+                case 2:
+                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+                        /* fall through */
+                case 4:
+                        PORTAL_FREE (client, sizeof(struct pingcli_data));
+                        /* Do not leave the file-scope pointer dangling. */
+                        client = NULL;
+                        break;
+                default:
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+/*
+ * Event-queue callback for the client's incoming MD.
+ *
+ * Reads the magic word and the sequence number from the start of the
+ * delivered message.  A message with the wrong magic is reported and
+ * ignored.  Otherwise the pinging task is woken when the sequence number
+ * equals 'count' (or 'count' is 0); any other sequence number is
+ * reported as a reply that arrived after its timeout.  Always returns 1.
+ */
+static int pingcli_callback(ptl_event_t *ev)
+{
+        int seq, hdr_magic;
+
+        seq = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned));
+        hdr_magic = *(int *)(ev->mem_desc.start + ev->offset);
+
+        if (hdr_magic != 0xcafebabe) {
+                printk ("Unexpected response \n");
+                return 1;
+        }
+
+        if (seq != count && count)
+                printk ("Received response after timeout for %d\n", seq);
+        else
+                wake_up_process (client->tsk);
+
+        return 1;
+}
+
+
+/*
+ * Run one complete ping session described by 'args', then tear it down.
+ *
+ * Allocates the send buffer (STDSIZE header plus ioc_size bytes) and a
+ * receive buffer with room for ioc_count replies, attaches the receive
+ * MD on portal PTL_PING_CLIENT, and sends ioc_count pings to portal
+ * PTL_PING_SERVER on nid args->ioc_nid.  Each ping carries the header
+ * magic, the sequence number and the send time; after each one the task
+ * sleeps for up to 20 * ioc_timeout jiffies waiting for
+ * pingcli_callback() to wake it.
+ *
+ * Everything, including the file-scope 'client' structure, is released
+ * before returning, so the return value is always NULL.
+ *
+ * Cleanup: Portals objects are released first (by hand for partially
+ * built state, through pingcli_shutdown() otherwise) and the data
+ * buffers last, so no MD still points at freed memory.
+ *
+ * NOTE(review): the task state is set to TASK_INTERRUPTIBLE only after
+ * PtlPut() returns, so a reply that arrives in between would not be
+ * seen as a wake-up -- confirm whether that matters for this test tool.
+ */
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        struct timeval tv1, tv2;
+        size_t out_len, in_len;
+        char *outbuf, *inbuf;
+        int rc;
+
+        client->tsk = current;
+        client->args = args;
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", "
+                        "nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+        /* One outgoing message, and room for one reply per ping.  Both
+         * numbers come straight from the ioctl, so refuse values whose
+         * sum or product wraps around. */
+        out_len = STDSIZE + args->ioc_size;
+        in_len = out_len * args->ioc_count;
+        if (out_len < STDSIZE ||
+            (args->ioc_count != 0 && in_len / args->ioc_count != out_len)) {
+                CERROR ("ping size %u * count %u is too large\n",
+                        args->ioc_size, args->ioc_count);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->outbuf, out_len);
+        if (client->outbuf == NULL) {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", out_len);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf, in_len);
+        if (client->inbuf == NULL) {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", in_len);
+                PORTAL_FREE (client->outbuf, out_len);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* pingcli_shutdown() frees 'client', so keep private copies of the
+         * buffer pointers for the final PORTAL_FREE()s. */
+        outbuf = client->outbuf;
+        inbuf = client->inbuf;
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) {
+                CERROR ("NAL %d not loaded\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                goto out_free_bufs;
+        }
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid))) {
+                CERROR ("PtlGetId error %d\n", rc);
+                goto out_put_ni;
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me))) {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                goto out_put_ni;
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                goto out_unlink_me;
+        }
+
+        /* No ping is outstanding yet. */
+        count = 0;
+
+        client->md_in_head.start     = inbuf;
+        client->md_in_head.length    = in_len;
+        client->md_in_head.threshold = PTL_MD_THRESH_INF;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (inbuf, 0, in_len);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                goto out_free_eq;
+        }
+
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = outbuf;
+        client->md_out_head.length    = out_len;
+        client->md_out_head.threshold = args->ioc_count;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header */
+        if ((rc = PtlMDBind (*nip, client->md_out_head,
+                             &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                /* Only the incoming side exists: level 2, not 1. */
+                pingcli_shutdown (2);
+                goto out_free_bufs;
+        }
+
+        while ((args->ioc_count - count)) {
+                /* Header layout: magic, sequence number, send time. */
+                memcpy (outbuf + sizeof(unsigned), &count, sizeof(unsigned));
+                do_gettimeofday (&tv1);
+                memcpy (outbuf + 2 * sizeof(unsigned), &tv1,
+                        sizeof(struct timeval));
+
+                /* Put the ping packet */
+                if ((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                        PDEBUG ("PtlPut (header)", rc);
+                        pingcli_shutdown (1);
+                        goto out_free_bufs;
+                }
+                /* No newline: one of the two messages below ends the line. */
+                printk ("sent msg no %d", count);
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                rc = schedule_timeout (20 * args->ioc_timeout);
+                if (rc == 0) {
+                        printk ("   ::  timeout .....\n");
+                } else {
+                        do_gettimeofday (&tv2);
+                        printk("   ::  Reply in %u usec\n",
+                                (unsigned)((tv2.tv_sec - tv1.tv_sec)
+                                 * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
+                }
+                count++;
+        }
+
+        /* Success! */
+        pingcli_shutdown (2);
+        goto out_free_bufs;
+
+out_free_eq:
+        if ((rc = PtlEQFree (client->eq)))
+                PDEBUG ("PtlEQFree", rc);
+out_unlink_me:
+        if ((rc = PtlMEUnlink (client->me)))
+                PDEBUG ("PtlMEUnlink", rc);
+out_put_ni:
+        pingcli_shutdown (3);
+out_free_bufs:
+        /* 'client' is gone by now; use the saved pointers. */
+        PORTAL_FREE (outbuf, out_len);
+        PORTAL_FREE (inbuf, in_len);
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/*
+ * Called by the portals_ioctl for ping requests.
+ *
+ * Allocates and zeroes the file-scope 'client' state, then hands the
+ * request to pingcli_start().  Always returns 0; an allocation failure
+ * is only logged.
+ */
+static int kping_client(struct portal_ioctl_data *args)
+{
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+
+        if (client != NULL) {
+                memset (client, 0, sizeof(struct pingcli_data));
+                pingcli_start (args);
+        } else {
+                CERROR ("Unable to allocate client structure\n");
+        }
+
+        return 0;
+} /* kping_client() */
+
+
+/* Module load: publish kping_client through PORTAL_SYMBOL_REGISTER.
+ * Always returns 0. */
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+/* Module unload: withdraw the kping_client symbol registered at init. */
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c
new file mode 100644 (file)
index 0000000..1037d09
--- /dev/null
@@ -0,0 +1,308 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+/* Size of the fixed ping header: two ints plus a struct timeval. */
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+/* Size of the server's single receive/reply buffer. */
+#define MAXSIZE (16*1024*1024)
+
+/* Set from PING_HEADER_MAGIC / PING_BULK_MAGIC in pingsrv_init(). */
+static unsigned ping_head_magic;
+static unsigned ping_bulk_magic;
+static int nal  = 0;                            // Your NAL,
+static unsigned long packets_valid = 0;         // Valid packets 
+/* Cleared by pingsrv_cleanup() to stop the thread; the thread sets it
+ * back to 1 once it has shut down. */
+static int running = 1;
+/* Packets received but not yet answered: incremented by the event
+ * callback, decremented by the server thread. */
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+/*
+ * Tear down the ping server.
+ *
+ * 'err' selects where the teardown sequence is entered; each case
+ * deliberately falls through into the ones below it:
+ *   1    - unlink the incoming MD, then continue as for 2
+ *   2    - free the EQ and unlink the ME, then as 3
+ *   3    - drop the NAL reference, then as 4/5
+ *   4, 5 - free the receive buffer (if allocated) and the server structure
+ * Any other value releases nothing.
+ *
+ * Always returns NULL so that callers can write
+ * "return pingsrv_shutdown (n);".  Does nothing when there is no server.
+ */
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Every case dereferences 'server', so check it once up front
+         * rather than only in front of the final free. */
+        if (server == NULL)
+                return NULL;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (in buffer)", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (nal);
+                        /* fall through */
+                case 4:
+                        /* fall through */
+                case 5:
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, MAXSIZE);
+
+                        PORTAL_FREE (server, sizeof (struct pingsrv_data));
+                        /* Do not leave the file-scope pointer dangling. */
+                        server = NULL;
+                        break;
+                default:
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping sever resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+/*
+ * Body of the ping server kernel thread.
+ *
+ * Sleeps until the event callback reports a packet ('pkt' non-zero),
+ * then, for the event saved in server->evnt: checks the header magic,
+ * stamps the reply magic over the start of the receive buffer, binds
+ * that buffer as an outgoing MD, re-attaches it as the incoming MD for
+ * the next ping, and PtlPut()s the reply to the initiator on
+ * PTL_PING_CLIENT.
+ *
+ * Leaves the loop when 'running' is cleared, releases everything via
+ * pingsrv_shutdown (1) and sets 'running' back to 1 to tell
+ * pingsrv_cleanup() it is done.  Returns 0 then, or 1 if PtlMDBind()
+ * failed.
+ *
+ * The magic words are handled as 32-bit 'unsigned' values: loading the
+ * header through an int into an unsigned long sign-extends on LP64 and
+ * never matches 0xdeadbeef, and copying sizeof(unsigned long) bytes of
+ * reply magic would overwrite the sequence number that follows it.
+ *
+ * NOTE(review): on the PtlMDBind() failure path 'running' is not reset,
+ * so a later pingsrv_cleanup() would wait forever -- confirm and fix
+ * together with the cleanup handshake.
+ */
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        unsigned magic;
+        unsigned reply_magic = 0xcafebabe;
+
+        kportal_daemonize ("pingsrv");
+        server->tsk =  current;
+
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                /* There is work to do: do not keep a sleeping state
+                 * while calling into Portals. */
+                set_current_state (TASK_RUNNING);
+
+                magic = *((unsigned *)(server->evnt.mem_desc.start
+                                        + server->evnt.offset));
+
+                /* An unexpected header is only reported; the packet is
+                 * still answered. */
+                if (magic != 0xdeadbeef) {
+                        printk("Unexpected Packet to the server\n");
+                }
+
+                /* The reply reuses the received bytes; only the magic
+                 * word at the front is replaced. */
+                memcpy (server->in_buf, &reply_magic, sizeof(reply_magic));
+
+                server->mdout.length    = server->evnt.rlength;
+                server->mdout.start     = server->in_buf;
+                server->mdout.threshold = 1;
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout,
+                                                &server->mdout_h))) {
+                        PDEBUG ("PtlMDBind", rc);
+                        pingsrv_shutdown (1);
+                        return 1;
+                }
+
+                /* Re-arm the incoming descriptor for the next ping. */
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = MAXSIZE;
+                server->mdin.threshold = 1;
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h)))
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+
+                atomic_dec (&pkt);
+        }
+
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;
+}
+
+/* Note one more pending packet in 'pkt' and wake the server thread.
+ * 'ev' is not used.  Always returns 1. */
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
+/*
+ * Event-queue callback of the ping server.
+ *
+ * Saves a copy of the event in server->evnt for the server thread, logs
+ * the sender and the first three ints of the message, bumps the
+ * packets_valid counter and passes the event to pingsrv_packet().
+ * Returns 0 for a NULL event, otherwise whatever pingsrv_packet()
+ * returns.
+ */
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        int *hdr;
+
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        server->evnt = *ev;
+
+        /* First three ints of the message, as labelled in the log line. */
+        hdr = (int *)(ev->mem_desc.start + ev->offset);
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               hdr[0], hdr[1], hdr[2]);
+
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+} /* pingsrv_callback() */
+
+
+/*
+ * Build the server's Portals state: NAL reference, process id, match
+ * entry on PTL_PING_SERVER, event queue (callback pingsrv_callback),
+ * the MAXSIZE receive buffer and its incoming MD.
+ *
+ * Returns 'server' on success.  On any failure everything acquired so
+ * far, including the server structure itself, is released and NULL is
+ * returned.
+ *
+ * The pingsrv_shutdown() level used on each failure matches what exists
+ * at that point: nothing but the NAL reference until the ME is
+ * attached, so those paths use level 3; the EQ-allocation failure
+ * unlinks the ME by hand first because level 2 would also free the EQ.
+ *
+ * NOTE(review): the early failure paths rely on server->in_buf being
+ * NULL before it is allocated -- confirm 'server' is zeroed at
+ * allocation.
+ */
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+        /* Acquire and initialize the proper nal for portals. */
+        nip = kportal_get_ni (nal);
+        if (nip == NULL) {
+                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni = *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        rc = PtlGetId (server->ni, &server->my_id);
+        if (rc) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+                          server->id_local, 0, ~0,
+                          PTL_RETAIN, PTL_INS_AFTER, &server->me);
+        if (rc) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback, &server->eq);
+        if (rc) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                /* The ME exists but the EQ does not. */
+                if ((rc = PtlMEUnlink (server->me)))
+                        PDEBUG ("PtlMEUnlink", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        PORTAL_ALLOC (server->in_buf, MAXSIZE);
+        if (!server->in_buf) {
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown (2);
+        }
+
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = MAXSIZE;
+        server->mdin.threshold = 1;
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+
+        rc = PtlMDAttach (server->me, server->mdin,
+                          PTL_UNLINK, &server->mdin_h);
+        if (rc) {
+                /* Without an incoming MD the server cannot receive
+                 * anything, so this is a setup failure, not a success. */
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        /* Success! */
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+        return server;
+} /* pingsrv_setup() */
+
+/*
+ * Set the server up and start its kernel thread.
+ *
+ * Returns 0 on success, -ENOMEM if pingsrv_setup() failed, or the
+ * negative value returned by kernel_thread() if the thread could not be
+ * created (in which case the server state is torn down again).
+ */
+static int pingsrv_start(void)
+{
+        int pid;
+
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+
+        /* Without the thread nobody answers pings and server->tsk is
+         * never set, so a failure here must undo the setup. */
+        pid = kernel_thread (pingsrv_thread, NULL, 0);
+        if (pid < 0) {
+                CERROR ("cannot start pingsrv thread: %d\n", pid);
+                pingsrv_shutdown (1);
+                return pid;
+        }
+
+        return 0;
+} /* pingsrv_start() */
+
+
+
+/*
+ * Module load: record the magic constants, allocate and zero the server
+ * structure and start the server.
+ *
+ * Returns -ENOMEM if the structure cannot be allocated, otherwise the
+ * result of pingsrv_start().
+ */
+static int __init pingsrv_init(void)
+{
+        ping_head_magic = PING_HEADER_MAGIC;
+        ping_bulk_magic = PING_BULK_MAGIC;
+
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                /* pingsrv_setup() dereferences 'server' unconditionally. */
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        /* Teardown tests server->in_buf against NULL, so start from a
+         * known-zero structure. */
+        memset (server, 0, sizeof(struct pingsrv_data));
+
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+/*
+ * Module unload: ask the server thread to stop and wait until it has.
+ *
+ * Clears 'running' and wakes the thread.  pingsrv_thread() sets
+ * 'running' back to 1 after it has left its loop and called
+ * pingsrv_shutdown(); this function sleeps HZ jiffies between checks
+ * for that.
+ */
+static void __exit pingsrv_cleanup(void)
+{
+        /* NOTE(review): no code creating "net/pingsrv" is visible in this
+         * file -- confirm the entry is ever registered. */
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c
new file mode 100644 (file)
index 0000000..4cef08b
--- /dev/null
@@ -0,0 +1,276 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* This is a stripped-down version of the pinger. It follows a single
+ * request-response protocol, does not do bulk data pinging, and does not
+ * send multiple packets in a single ioctl.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+/* Size of every buffer and message used by this client. */
+#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes
+                                                   assumed */
+
+/* This should be enclosed in a structure */
+
+/* Session state: allocated by kping_client(), freed by pingcli_shutdown(). */
+static struct pingcli_data *client = NULL;
+
+/* Only ever reset to 0 in the visible part of this file. */
+static int count = 0;
+
+/*
+ * Release the ping client's Portals resources, buffers and structure.
+ *
+ * 'err' selects where the teardown sequence is entered; each case
+ * deliberately falls through into the ones below it:
+ *   1 - unlink the outgoing MD, then continue as for 2
+ *   2 - free the EQ and unlink the ME, then as 3
+ *   3 - drop the NAL reference, then as 4
+ *   4 - free the in/out buffers (if allocated) and the client structure
+ * Any other value releases nothing.  Does nothing when there is no
+ * client.
+ */
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Every case dereferences 'client', so check it once up front
+         * rather than only in front of the final free. */
+        if (client == NULL)
+                return;
+
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+                        /* fall through */
+                case 4:
+                        /* Free our buffers */
+                        if (client->outbuf != NULL)
+                                PORTAL_FREE (client->outbuf, STDSIZE);
+
+                        if (client->inbuf != NULL)
+                                PORTAL_FREE (client->inbuf, STDSIZE);
+
+                        PORTAL_FREE (client, sizeof(struct pingcli_data));
+                        /* Do not leave the file-scope pointer dangling. */
+                        client = NULL;
+                        break;
+                default:
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+/* Event-queue callback: any event on the client's EQ wakes the task
+ * sleeping in pingcli_start().  The event itself is not inspected.
+ * Always returns 1. */
+static int pingcli_callback(ptl_event_t *ev)
+{
+                wake_up_process (client->tsk);
+        return 1;
+}
+
+
+/*
+ * Send a single ping described by 'args' and wait for the reply.
+ *
+ * Allocates STDSIZE-byte send and receive buffers, attaches the receive
+ * MD on portal PTL_PING_CLIENT, puts one message starting with the
+ * header magic to portal PTL_PING_SERVER on nid args->ioc_nid, and then
+ * sleeps for up to 20 * ioc_timeout jiffies waiting for
+ * pingcli_callback() to wake it.
+ *
+ * Everything, including the file-scope 'client' structure, is released
+ * through pingcli_shutdown() before returning, so the return value is
+ * always NULL.
+ *
+ * The pingcli_shutdown() level used on each failure matches what exists
+ * at that point: level 3 until the ME is attached, level 2 while only
+ * the incoming side exists, level 1 once the outgoing MD is bound.
+ *
+ * NOTE(review): the task state is set to TASK_INTERRUPTIBLE only after
+ * PtlPut() returns, so a reply that arrives in between would not be
+ * seen as a wake-up -- confirm whether that matters for this test tool.
+ */
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        const ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        int rc;
+
+        client->tsk = current;
+        client->args = args;
+
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", "
+                        "nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE);
+        if (client->outbuf == NULL) {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf, STDSIZE);
+        if (client->inbuf == NULL) {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) {
+                CERROR ("NAL %d not loaded.\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid))) {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (3);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me))) {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (3);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                /* The ME exists but the EQ does not: level 2 would also
+                 * free the EQ, so unlink the ME by hand. */
+                if ((rc = PtlMEUnlink (client->me)))
+                        PDEBUG ("PtlMEUnlink", rc);
+                pingcli_shutdown (3);
+                return (NULL);
+        }
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = STDSIZE;
+        client->md_in_head.threshold = 1;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, STDSIZE);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                /* The outgoing MD is not bound yet: level 2, not 1. */
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE;
+        client->md_out_head.threshold = 1;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header */
+        if ((rc = PtlMDBind (*nip, client->md_out_head,
+                             &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                /* The bind failed, so there is no outgoing MD to unlink. */
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Put the ping packet */
+        if ((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                         client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                PDEBUG ("PtlPut (header)", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+
+        count = 0;
+        set_current_state (TASK_INTERRUPTIBLE);
+        rc = schedule_timeout (20 * args->ioc_timeout);
+        if (rc == 0)
+                printk (" Time out on the server\n");
+        else
+                printk("Received respose from the server \n");
+
+        /* Timeout and success are torn down the same way. */
+        pingcli_shutdown (2);
+
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/*
+ * Called by the portals_ioctl for ping requests.
+ *
+ * Allocates and zeroes the file-scope 'client' state, then hands the
+ * request to pingcli_start().  Always returns 0; an allocation failure
+ * is only logged.
+ */
+static int kping_client(struct portal_ioctl_data *args)
+{
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        /* Clear only after the NULL check: memset() on a failed
+         * allocation would dereference NULL. */
+        memset (client, 0, sizeof(struct pingcli_data));
+
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+/* Module load: publish kping_client through PORTAL_SYMBOL_REGISTER.
+ * Always returns 0. */
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+/* Module unload: withdraw the kping_client symbol registered at init. */
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c
new file mode 100644 (file)
index 0000000..a18ea35
--- /dev/null
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is a stripped-down version of pinger. It follows a single
+ * request-response protocol. It doesn't do bulk data pinging and doesn't
+ * send multiple packets in a single ioctl.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+/* Size in bytes of the single ping buffer: two ints plus 4 more bytes. */
+#define STDSIZE (sizeof(int) + sizeof(int) + 4)
+
+/* NAL number handed to kportal_get_ni(); settable through the "nal"
+ * module parameter declared at the bottom of this file. */
+static int nal  = 0;                            // Your NAL,
+/* Incremented once per event seen by pingsrv_callback(). */
+static unsigned long packets_valid = 0;         // Valid packets 
+/* Loop flag for pingsrv_thread(): pingsrv_cleanup() clears it to request
+ * a stop and the thread sets it back to 1 once it has shut down. */
+static int running = 1;
+/* Pending-ping counter: raised by pingsrv_packet(), lowered by the thread. */
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#endif
+
+/*
+ * pingsrv_shutdown() - release ping server resources.
+ * @err: first cleanup stage to run; every later stage runs as well:
+ *         1  unlink the incoming memory descriptor
+ *         2  free the event queue and unlink the match entry
+ *         3  drop the NAL reference taken with kportal_get_ni()
+ *         4  free the ping buffer and the server structure
+ *       Any other value releases nothing.
+ *
+ * Always returns NULL so that callers can "return pingsrv_shutdown (n);".
+ * Calling it when the server structure is absent (never allocated, or
+ * already released) is a no-op.
+ */
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Every stage below reads the server structure, so test the
+         * pointer once, up front, rather than after it has been used. */
+        if (server == NULL) {
+                CDEBUG (D_OTHER, "ping sever resources released\n");
+                return NULL;
+        }
+
+        /* Yes, we are intentionally allowing us to fall through each
+         * case in to the next.  This allows us to pass an error
+         * code to just clean up the right stuff.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                        /* fall through */
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                        /* fall through */
+                case 3:
+                        kportal_put_ni (nal);
+                        /* fall through */
+                case 4:
+                        if (server->in_buf != NULL) {
+                                PORTAL_FREE (server->in_buf, STDSIZE);
+                        }
+
+                        PORTAL_FREE (server, sizeof (struct pingsrv_data));
+                        /* Do not leave a dangling pointer behind: a second
+                         * call must not free the structure again. */
+                        server = NULL;
+                        break;
+                default:
+                        break;
+        }
+
+        CDEBUG (D_OTHER, "ping sever resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+/*
+ * pingsrv_thread() - body of the ping server kernel thread.
+ * @arg: unused
+ *
+ * Daemonizes as "pingsrv", records itself in server->tsk so that
+ * pingsrv_packet() and pingsrv_cleanup() can wake it, then loops while
+ * `running` is set.  With no pending ping (pkt == 0) it sleeps; otherwise
+ * it binds an outgoing MD over in_buf, re-attaches the incoming MD to the
+ * match entry, sends the buffer back to the initiator saved in
+ * server->evnt on portal PTL_PING_CLIENT and decrements pkt.
+ *
+ * Returns 0 after a requested stop, or 1 if PtlMDBind() fails.
+ */
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        
+        kportal_daemonize ("pingsrv");
+        server->tsk = current;
+        
+        while (running) {
+                /* Mark ourselves sleeping before testing pkt so that a
+                 * wake_up_process() issued after the test is not lost. */
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                               
+                /* Reply from the same buffer the request arrived in */
+                server->mdout.start     = server->in_buf;
+                server->mdout.length    = STDSIZE;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout, 
+                                                &server->mdout_h))) {
+                         PDEBUG ("PtlMDBind", rc);
+                         /* NOTE(review): this exit path never sets `running`
+                          * back to 1, which pingsrv_cleanup() polls for --
+                          * looks like module unload would wait forever;
+                          * confirm. */
+                         pingsrv_shutdown (1);
+                         return 1;
+               }
+         
+                
+                /* Re-arm the incoming descriptor (threshold 1, PTL_UNLINK)
+                 * for the next request */
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = STDSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                        /* NOTE(review): this "allocated" message is emitted
+                         * only on failure -- presumably a copy/paste slip. */
+                        CDEBUG (D_OTHER, "ping server resources allocated\n");
+                }
+                
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        /* Stop requested: release everything, then tell pingsrv_cleanup() */
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;    
+}
+
+/* Notes one more pending ping and wakes the server thread to answer it.
+ * The event argument is not used here.  Always returns 1. */
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
+/*
+ * pingsrv_callback() - handler passed to PtlEQAlloc() by pingsrv_setup().
+ * @ev: the event being delivered
+ *
+ * Saves a copy of the event in server->evnt (the server thread replies to
+ * evnt.initiator), logs the sender and the first int of the payload,
+ * counts the packet and hands off to pingsrv_packet().
+ *
+ * Returns 0 for a NULL event, otherwise the result of pingsrv_packet().
+ */
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        int *head;
+
+        if (!ev) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+
+        server->evnt = *ev;
+
+        /* First word of the received data inside the memory descriptor */
+        head = (int *)(ev->mem_desc.start + ev->offset);
+
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *head);
+
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+} /* pingsrv_callback() */
+
+
+/*
+ * pingsrv_setup() - acquire the NAL and build the server's portals objects.
+ *
+ * Expects the global server structure to be allocated already.  Takes a
+ * reference on the NAL, attaches a match entry on PTL_PING_SERVER that
+ * accepts any sender, allocates the event queue and the ping buffer and
+ * attaches the incoming memory descriptor.
+ *
+ * Returns the server structure on success.  On any failure everything
+ * acquired so far is released through pingsrv_shutdown() and NULL is
+ * returned.
+ */
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+        if (server == NULL) {
+                CDEBUG (D_OTHER, "ping server structure not allocated\n");
+                return NULL;
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni = *nip;
+
+        /* Based on the initialization acquire our unique portal ID.
+         * No EQ or ME exists yet, so start cleanup at stage 3. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                /* Stage 2 would also free the event queue that was never
+                 * allocated, so unlink the match entry by hand. */
+                if ((rc = PtlMEUnlink (server->me)))
+                        PDEBUG ("PtlMEUnlink", rc);
+                return pingsrv_shutdown (3);
+        }
+
+        PORTAL_ALLOC (server->in_buf, STDSIZE);
+        if (server->in_buf == NULL) {
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown (2);
+        }
+        memset (server->in_buf, 0, STDSIZE);
+
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = STDSIZE;
+        server->mdin.threshold = 1;
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+
+        /* Without the incoming descriptor no ping can ever be received,
+         * so treat a failed attach as a setup failure. */
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        /* Success! */
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+        return server;
+} /* pingsrv_setup() */
+
+/*
+ * pingsrv_start() - set the server up and launch its kernel thread.
+ *
+ * Returns 0 on success, -ENOMEM when pingsrv_setup() fails, or the
+ * negative value returned by kernel_thread() when the thread cannot be
+ * created.  In that last case the resources built by pingsrv_setup() are
+ * released again.
+ */
+static int pingsrv_start(void)
+{
+        int pid;
+
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+
+        pid = kernel_thread (pingsrv_thread,NULL,0);
+        if (pid < 0) {
+                /* No thread means server->tsk is never set and nothing
+                 * would ever answer or clean up: undo the setup now. */
+                CERROR ("unable to start pingsrv thread: %d\n", pid);
+                pingsrv_shutdown (1);
+                return pid;
+        }
+        return 0;
+} /* pingsrv_start() */
+
+
+
+/*
+ * pingsrv_init() - module entry point.
+ *
+ * Allocates the global server structure and starts the server.
+ * Returns 0 on success, -ENOMEM if the structure cannot be allocated,
+ * or the error reported by pingsrv_start().
+ */
+static int __init pingsrv_init(void)
+{
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        /* pingsrv_setup() dereferences the structure unconditionally */
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+/*
+ * pingsrv_cleanup() - module exit point.
+ *
+ * Asks the server thread to stop by clearing `running`, wakes it, and
+ * then polls about once a second (schedule_timeout (HZ)) until the thread
+ * has set `running` back to 1, which it does after releasing its
+ * resources.
+ */
+static void __exit pingsrv_cleanup(void)
+{
+        /* NOTE(review): nothing in this file creates "net/pingsrv" --
+         * presumably left over from the full pinger; confirm. */
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        /* NOTE(review): this never terminates if the thread has already
+         * exited without resetting `running`; confirm the failure paths. */
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+/* "nal" is an integer module parameter selecting the NAL by number. */
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+/* Hook the init/exit functions defined above into module load/unload. */
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lustre/portals/tests/startclient.sh b/lustre/portals/tests/startclient.sh
new file mode 100755 (executable)
index 0000000..c9b7c16
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli.o
+else
+       PING=spingcli.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING 
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+exit 0;
diff --git a/lustre/portals/tests/startserver.sh b/lustre/portals/tests/startserver.sh
new file mode 100755 (executable)
index 0000000..942300e
--- /dev/null
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv.o
+else
+       PING=spingsrv.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING nal=4
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING nal=2
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING nal=4
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lustre/portals/tests/stopclient.sh b/lustre/portals/tests/stopclient.sh
new file mode 100755 (executable)
index 0000000..f7e3aa1
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-1}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=spingcli
+else
+       PING=pingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lustre/portals/tests/stopserver.sh b/lustre/portals/tests/stopserver.sh
new file mode 100644 (file)
index 0000000..3e81831
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-1}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=spingsrv
+else
+       PING=pingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am
new file mode 100644 (file)
index 0000000..b62b401
--- /dev/null
@@ -0,0 +1,5 @@
+# Builds the user-space TCP NAL as the static library libtcpnal.a and
+# installs its public headers.
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lustre/portals/unals/README b/lustre/portals/unals/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However this means that addresses
+exchanged in messages between hosts of different orderings will not
+function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be greatly
+improved by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+*/
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node: its IPv4 address in host byte
+ *          order, or 0 if no usable address could be determined
+ *
+ * If the PTL_HOSTID environment variable is set it supplies the id,
+ *  either as dotted decimal (n.n.n.n, each n in 0..255) or, when its
+ *  second character is 'x', as a number parsed by strtoll() with base 0.
+ *  Otherwise get_node_id() determines the host name and uses the
+ *  resolver to find out its ip address. This is fairly fragile and
+ *  inflexible, but explicitly asking about interfaces and their
+ *  addresses is very complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    struct hostent *he;
+    char * host_envp;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            const unsigned char *ip;
+
+            if (gethostname(buffer,sizeof(buffer)) != 0)
+                return(0);
+            /* a truncated name is not guaranteed to be terminated */
+            buffer[sizeof(buffer) - 1] = '\0';
+
+            he=gethostbyname(buffer);
+            if (!he || he->h_length < 4 || !he->h_addr_list[0])
+                return(0);
+
+            /* The resolver returns the address bytes in network order.
+             * Assembling them most significant first gives the host
+             * order value without an unaligned or type-punned load. */
+            ip = (const unsigned char *)he->h_addr_list[0];
+            return(((unsigned int)ip[0] << 24) |
+                   ((unsigned int)ip[1] << 16) |
+                   ((unsigned int)ip[2] << 8)  |
+                    (unsigned int)ip[3]);
+        }
+
+    /* an empty value has no second character to look at */
+    if (host_envp[0] != '\0' && host_envp[1] == 'x')
+        {
+            long long hostid = strtoll(host_envp, 0, 0);
+            return((unsigned int) hostid);
+        }
+    else
+        {
+            unsigned int a, b, c, d;
+
+            /* all four octets must be present and fit in a byte */
+            if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4 ||
+                a > 255 || b > 255 || c > 255 || d > 255)
+                {
+                    fprintf(stderr, "invalid PTL_HOSTID \"%s\"\n", host_envp);
+                    return(0);
+                }
+            return ((a<<24) | (b<<16) | (c<<8) | d);
+        }
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the procnal structures.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+/* DIRECT_IP_MODE variant: the nid is the value from get_node_id() and the
+ * pid is the requested one, with PTL_PID_ANY mapped to 0. */
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port = (pidrequest==(unsigned short)PTL_PID_ANY) ? 0 : pidrequest;
+
+    t->nal_cb->ni.nid = get_node_id();
+    t->nal_cb->ni.pid = port;
+}
+#else
+
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode, in_addr, port; 
+    ptl_pid_t pid;
+
+    /* get and remember my node id*/
+    if (!getenv("PTL_VIRTNODE"))
+        virtnode = 0;
+    else 
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT 
+                                              >> PNAL_VNODE_SHIFT);
+            virtnode = atoi(getenv("PTL_VIRTNODE"));
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+    
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+#ifdef notyet
+    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
+#endif
+    if (pid==(unsigned short)PTL_PID_ANY) 
+        {
+            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
+            return;
+        }
+    else if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef UNALS_BRIDGE_H
+#define UNALS_BRIDGE_H
+
+#include <portals/lib-p30.h>
+
+/* Per-interface state shared between the generic portals library and a
+ * user-space NAL.  Note that `bridge` names a POINTER to struct bridge. */
+typedef struct bridge {
+    int alive;
+    /* library-side callbacks and state; set_address() fills in
+     * nal_cb->ni.nid and nal_cb->ni.pid */
+    nal_cb_t *nal_cb;
+    void *lower;
+    void *local;
+    /* invoked with the bridge itself to tear the interface down */
+    void (*shutdown)(struct bridge *);
+    /* this doesn't really belong here */
+    /* set by set_address() from the node id, for making new connections */
+    unsigned char iptop8;
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+/* A NAL initializer takes the bridge to set up; one slot per interface. */
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif /* UNALS_BRIDGE_H */
diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c
new file mode 100644 (file)
index 0000000..89c9f78
--- /dev/null
@@ -0,0 +1,293 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port.  force_tcp_connection() always connects
+ * to this port on the peer, whatever port it was asked for. */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: arg1: a connection stored in the hash table
+ *            arg2: the key to verify against, an array of two
+ *                  unsigned ints holding { ip, port }
+ * Returns: 1 if the connection is the one requested, 0 otherwise
+ *
+ *    compare_connection() tests for collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection conn = arg1;
+        unsigned int *key = arg2;
+
+        if (conn->ip != key[0])
+                return(0);
+        return(conn->port == key[1]);
+}
+
+
+/* Function:  connection_key
+ * Arguments: id: the key to hash, an array of two unsigned ints
+ *                (filled with { ip, port } by the callers in this file)
+ * Returns: a not-particularly-well-distributed hash of the id:
+ *          the exclusive-or of its two elements
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    return(id[0]^id[1]);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: arg: the connection to remove
+ *
+ * Takes the connection out of its manager's hash table, closes its
+ * socket and frees it; the pointer must not be used afterwards.
+ */
+void remove_connection(void *arg)
+{
+        connection conn = arg;
+        unsigned int key[2];
+
+        /* rebuild the { ip, port } key the entry was inserted under */
+        key[0] = conn->ip;
+        key[1] = conn->port;
+
+        hash_table_remove(conn->m->connections, key);
+        close(conn->fd);
+        free(conn);
+}
+
+
+/* Function:  read_connection: 
+ * Arguments: c:    the connection to read from 
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read   
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors. A read interrupted by a signal is retried. On end of
+ *   file or any other error the connection is removed (closed and
+ *   freed), so the caller must not use c after a 0 return. A len of
+ *   zero or less reads nothing and succeeds.
+ *   TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    while (offset<len) {
+        rc=syscall(SYS_read, c->fd, dest+offset, len-offset);
+        if (rc>0) {
+            offset+=rc;
+            continue;
+        }
+        /* errno is only meaningful after a failed read; at end of
+         * file (rc==0) it is stale and must not be consulted */
+        if (rc<0 && errno==EINTR)
+            continue;
+        remove_connection(c);
+        return(0);
+    }
+    return(1);
+}
+
+/* Read handler registered for every connection: forwards to the manager's
+ * input handler with the manager's argument and the connection, and
+ * returns whatever that handler returns. */
+static int connection_input(connection c)
+{
+    return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:    manager the allocation is occurring in the context of
+ *            ip:   address of the peer, used as the first half of the key
+ *            port: port of the peer, used as the second half of the key
+ *            fd:   open file descriptor for the socket
+ * Returns: an allocated connection structure, or 0 if memory is
+ *          exhausted (fd is closed in that case)
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+
+    if (!c) {
+        /* nobody else owns the descriptor, so do not leak it */
+        perror("tcpnal connection allocation");
+        close(fd);
+        return(0);
+    }
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request; it accepts and installs a new
+ *     connection keyed by the peer's address in host order and
+ *     port 0. A failed accept is reported and otherwise ignored.
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    socklen_t len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid;
+
+    if (fd<0) {
+        /* never install a handler for an invalid descriptor */
+        perror("tcpnal accept");
+        return(1);
+    }
+    nid=s.sin_addr.s_addr;
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,ntohl(nid),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: m:    the connection manager
+ *            ip:   address of the peer, in host order
+ *            port: ignored; the connection always goes to
+ *                  tcpnal_acceptor_port
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection; 0 if the socket
+ *          cannot be created, the connect fails or memory runs out
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+    int fd;
+
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    if ((c=hash_table_find(m->connections,id)))
+        return(c);
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = htonl(ip);
+    addr.sin_port        = htons(port);
+
+    if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
+        /* report failure like a failed connect instead of taking the
+         * whole process down from inside the library */
+        perror("tcpnal socket failed");
+        return(0);
+    }
+    if (connect(fd,
+                (struct sockaddr *)&addr,
+                sizeof(struct sockaddr_in)))
+        {
+            perror("tcpnal connect");
+            /* the descriptor would otherwise leak on every failure */
+            close(fd);
+            return(0);
+        }
+    return(allocate_connection(m,ip,port,fd));
+}
+
+
+/* Function:  bind_socket
+ * Arguments: m: the connection manager for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound, bound_handler and port elements of m. On
+ *  error the socket is closed again and m->bound is set to -1.
+ *
+ *  TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    socklen_t alen=sizeof(struct sockaddr_in);
+    
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
+        return(0);
+    
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    /* NOTE(review): the port is stored without htons() and read back
+     * raw into m->port below -- confirm which byte order callers use. */
+    addr.sin_port        = port; 
+    
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind"); 
+        goto fail;
+    }
+    
+    /* learn the port actually assigned when 0 was requested */
+    if (getsockname(m->bound,(struct sockaddr *)&addr, &alen)<0){
+        perror ("tcpnal getsockname");
+        goto fail;
+    }
+
+    if (listen(m->bound,5)<0){
+        perror ("tcpnal listen");
+        goto fail;
+    }
+
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    m->port=addr.sin_port;
+    return(1);
+
+fail:
+    close(m->bound);
+    m->bound=-1;
+    return(0);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources: closes the listening
+ * socket, removes its io handler, destroys the connection table with
+ * remove_connection as the per-entry callback, and frees m itself.
+ * m must not be used afterwards.
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: pid:   the port to attempt to bind to
+ *            input: handler called when a connection has input; it
+ *                   receives a and the connection
+ *            a:     opaque argument passed back to input
+ * Returns: a newly allocated manager structure, or
+ *          zero if memory is exhausted or the port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(),
+                         void *a)
+{
+    manager m=malloc(sizeof(struct manager));
+
+    if (!m) return(0);
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+    /* release the table too; it is still empty at this point */
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+    return(0);
+}
diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h
new file mode 100644 (file)
index 0000000..f6b2994
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef UNALS_CONNECTION_H
+#define UNALS_CONNECTION_H
+
+#include <table.h>
+
+/* State of the connection manager.  Note that `manager` names a POINTER
+ * to struct manager. */
+typedef struct manager {
+    table connections;          /* open connections, keyed by { ip, port } */
+    int bound;                  /* listening socket */
+    io_handler bound_handler;   /* read handler registered for `bound` */
+    int (*handler)(void *, void *); /* called when a connection has input */
+    void *handler_arg;          /* first argument passed to `handler` */
+    unsigned short port;        /* port of the listening socket */
+} *manager;
+
+
+/* One TCP connection to a peer.  `connection` names a POINTER to it. */
+typedef struct connection {
+    unsigned int ip;
+    unsigned short port;
+    int fd;                     /* the connected socket */
+    manager m;                  /* owning manager */
+} *connection;
+
+connection force_tcp_connection(manager m,
+                                unsigned int ip,  
+                                unsigned short port);
+manager init_connections(unsigned short,
+                         int (*f)(void *,connection),
+                         void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len);
+
+#endif /* UNALS_CONNECTION_H */
diff --git a/lustre/portals/unals/debug.c b/lustre/portals/unals/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+/* value printed as the third field of every message prefix
+   by portals_debug_msg() */
+int smp_processor_id = 1;
+/* NOTE(review): not referenced by any function in this file */
+char debug_file_path[1024] = "/tmp/lustre-log";
+/* printed by the dumplog functions; never assigned in this file */
+char debug_file_name[1024];
+/* stream the debug output goes to; set to stdout by portals_debug_init() */
+FILE *debug_file_fd;
+
+/* Point the user at the log file name; the argument is ignored.
+ * Always returns 0. */
+int portals_do_debug_dumplog(void *arg)
+{
+        (void)arg;
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+/* Intentionally empty in this user-space build. */
+void portals_debug_print(void)
+{
+}
+
+
+/* Point the user at the log file name. */
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+}
+
+
+/* Direct all debug output to stdout; bufsize is ignored.
+ * Always returns 0. */
+int portals_debug_init(unsigned long bufsize)
+{
+        (void)bufsize;
+        debug_file_fd = stdout;
+        return 0;
+}
+
+/* Nothing is released here: the debug stream is left open.
+ * Always returns 0. */
+int portals_debug_cleanup(void)
+{
+        return 0;
+}
+
+/* No buffer is kept by this implementation, so there is nothing to
+ * clear.  Always returns 0. */
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+/* Write a visually distinct marker carrying `text` to the debug
+ * stream, framed by two rows of asterisks.  Always returns 0. */
+int portals_debug_mark_buffer(char *text)
+{
+        static const char rule[] =
+                "*******************************************************************************\n";
+
+        fprintf(debug_file_fd, "%s", rule);
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "%s", rule);
+
+        return 0;
+}
+
+/* Nothing is copied by this implementation; both arguments are
+ * ignored.  Always returns 0. */
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        (void)buf;
+        (void)len;
+        return 0;
+}
+
+/* Write one debug message to the debug stream.
+ *
+ * The message is preceded by two prefixes:
+ *   "<subsys>:<mask>:<smp_processor_id>:<sec>.<usec> " and
+ *   "(<file>:<line>:<fn>() 0+<stack>): "
+ * where <stack> is derived from the address of a local variable
+ * relative to an 8192-byte boundary.  `format` and the variadic
+ * arguments follow printf conventions.
+ */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;   /* only its address is used, see below */
+        struct timeval tv;
+        /* fall back to stdout (what portals_debug_init() selects) if a
+         * message is logged before initialisation */
+        FILE *out = debug_file_fd ? debug_file_fd : stdout;
+
+        gettimeofday(&tv, NULL);
+
+        /* cast the timeval fields so they match the %lu conversions
+         * regardless of how time_t/suseconds_t are defined */
+        fprintf(out,
+                "%02x:%06x:%d:%lu.%06lu ",
+                subsys >> 24, mask, smp_processor_id,
+                (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        fprintf(out,
+                "(%s:%d:%s() %d+%lu): ",
+                file, line, fn, 0,
+                8192 - ((unsigned long)&flags & 8191UL));
+
+        /* a va_list must be formatted with vfprintf; handing it to
+         * fprintf prints garbage or crashes */
+        va_start (ap, format);
+        vfprintf(out, format, ap);
+        va_end (ap);
+}
+
+
diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+#ifndef DISPATCH_H
+#define DISPATCH_H
+
+typedef struct io_handler *io_handler;
+
+/* One registered interest in I/O on a file descriptor. */
+struct io_handler{
+  io_handler *last;          /* NOTE(review): looks like a back link to the
+                                pointer that refers to this node - confirm
+                                in select.c */
+  io_handler next;           /* next handler in the list */
+  int fd;                    /* descriptor being watched */
+  int type;                  /* mask of the *_HANDLER bits below */
+  int (*function)(void *);   /* callback invoked with `argument` */
+  void *argument;
+  int disabled;              /* NOTE(review): presumably set by
+                                remove_io_handler() - confirm */
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+/* the `when` type used below is declared in pqtimer.h, which has to be
+   included before this header */
+void select_timer_block(when until);
+when now(void);
+
+#endif /* DISPATCH_H */
diff --git a/lustre/portals/unals/ipmap.h b/lustre/portals/unals/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* Mapping between (ip address, port) and portals (nid, pid).
+ *
+ * DIRECT_IP_MODE is defined unconditionally, so the identity mapping
+ * is the one in effect: the nid is the ip address and the pid is the
+ * port.  The packed encoding in the #else branch is currently
+ * compiled out.
+ */
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+/* nid = low 24 bits of the host address, shifted up by 8, combined
+   with the bits of the port offset above the pid field */
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+/* `t` is parenthesised so that any pointer expression may be passed */
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | ((t)->iptop8 << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lustre/portals/unals/pqtimer.c b/lustre/portals/unals/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* timer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* One pending timer; instances are owned by the priority queue and
+   freed by timer_fire(). */
+struct timer {
+  void (*function)(void *);  /* callback run when the timer expires */
+  void *arg;                 /* argument handed to the callback */
+  when w;                    /* absolute expiry time: now()+interval */
+  when interval;             /* interval as passed to register_timer();
+                                kept as `when` because one second is
+                                2^32 units and does not fit in an int */
+  int disable;               /* set by remove_timer(); suppresses the
+                                callback when the timer comes due */
+};
+
+/* A thunk is a callback run on every pass of timer_loop(); thunks are
+   kept in a singly linked list headed by `thunks`. */
+typedef struct thunk *thunk;
+struct thunk {
+    void (*f)(void *);   /* function to call */
+    void *a;             /* argument passed to f */
+    thunk next;          /* next thunk in the list, or 0 */
+};
+
+extern when now(void);
+
+/* head of the thunk list; register_thunk() pushes at the front */
+static thunk thunks;
+/* NOTE(review): only ever read (in register_timer) and never assigned
+   in this file, so it keeps its initial value of 0 */
+static int internal;
+/* blocking primitive supplied to initialize_timer() */
+static void (*block_function)(when);
+/* number of live entries in the heap; the heap occupies
+   timers[1..number_of_timers], index 0 is unused */
+static int number_of_timers;
+/* number of slots allocated for `timers` */
+static int size_of_pqueue;
+/* binary min-heap of timers ordered by expiry time */
+static timer *timers;
+
+
+/* Sift the entry at index `where` down the heap until neither of its
+   children expires earlier than it does. */
+static void heal(int where)
+{
+    for (;;){
+       int lchild=where<<1;
+       int rchild=lchild+1;
+       int earliest=where;
+       timer swap;
+
+       if ((lchild <= number_of_timers) &&
+           (timers[lchild]->w < timers[earliest]->w)) earliest=lchild;
+       if ((rchild <= number_of_timers) &&
+           (timers[rchild]->w < timers[earliest]->w)) earliest=rchild;
+       if (earliest == where) return;
+
+       swap=timers[where];
+       timers[where]=timers[earliest];
+       timers[earliest]=swap;
+       where=earliest;
+    }
+}
+
+/* Sift the entry at index `i` up the heap while it expires earlier
+   than its parent; the root lives at index 1. */
+static void add_pqueue(int i)
+{
+    while (i>1){
+       int up=(i>>1);
+       timer swap;
+
+       if (!(timers[i]->w < timers[up]->w)) break;
+
+       swap=timers[i];
+       timers[i]=timers[up];
+       timers[up]=swap;
+       i=up;
+    }
+}
+
+/* Append `t` to the heap, growing the backing array when needed, and
+   restore heap order. */
+static void add_timer(timer t)
+{
+    /* index 0 is unused, so the new entry lands at number_of_timers+1
+       and the array must hold at least number_of_timers+2 slots */
+    if (size_of_pqueue<(number_of_timers+2)){
+       int newsize=size_of_pqueue+10;
+       /* the array holds timer pointers, so size it by sizeof(timer);
+          realloc also releases the old array instead of leaking it */
+       timer *grown=(timer *)realloc(timers,sizeof(timer)*newsize);
+
+       /* no way to report failure from here; stop deterministically
+          rather than write through a null pointer */
+       if (!grown) abort();
+       timers=grown;
+       size_of_pqueue=newsize;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure, or zero if it could not
+ *          be allocated
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    if (!t) return(0);
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    /* when this is the only timer, hand its deadline to the block
+       function straight away */
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to cancel
+ * Returns: nothing
+ *
+ * remove_timer cancels a timer, ensuring that its callback will
+ * never be called. The timer is only marked as disabled here; it
+ * stays in the queue and is freed by timer_fire() when it comes due,
+ * which avoids reentrancy issues.
+ */
+
+void remove_timer(timer t)
+{
+    t->disable = 1;
+}
+
+
+
+/* Pop the earliest timer off the heap, run its callback unless it
+   has been disabled by remove_timer(), and free it.  Does nothing
+   when no timer is queued. */
+void timer_fire()
+{
+    timer current;
+
+    /* timers[1] is only meaningful while the heap is non-empty */
+    if (number_of_timers < 1) return;
+
+    current=timers[1];
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+/* Fire every timer that is due as of the time of the call, then
+   return the expiry time of the earliest remaining timer, or 0 if
+   none is left. */
+when next_timer(void)
+{
+    when here=now();
+
+    for (;;){
+       if (!number_of_timers) return(0);
+       if (timers[1]->w > here) return(timers[1]->w);
+       timer_fire();
+    }
+}
+
+/* call every registered thunk once, most recently registered first */
+static void run_thunks(void)
+{
+    thunk cur;
+
+    for (cur=thunks;cur;cur=cur->next) (*cur->f)(cur->a);
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ *
+ * timer_loop() is the blocking dispatch function for the timer.
+ * On every pass it runs the registered thunks, then either fires the
+ * earliest timer if it is due or calls the block function registered
+ * with initialize_timer() until that timer's deadline (or with 0 when
+ * no timer is queued).
+ */
+void timer_loop()
+{
+    for (;;){
+       when here=now();
+
+       run_thunks();
+
+       if (!number_of_timers){
+           /* with nothing queued the thunks get a second pass
+              before blocking */
+           run_thunks();
+           (*block_function)(0);
+           continue;
+       }
+
+       if (timers[1]->w > here)
+           (*block_function)(timers[1]->w);
+       else
+           timer_fire();
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals; they
+ * should not assume when, or take a particularly long
+ * amount of time. Thunks are for background cleanup tasks.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk t=(thunk)malloc(sizeof(struct thunk));
+
+    /* no way to report failure from here; stop deterministically
+       rather than write through a null pointer */
+    if (!t) abort();
+
+    t->f=f;
+    t->a=a;
+    /* push at the head of the list */
+    t->next=thunks;
+    thunks=t;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function used to block until a given time
+ *
+ * initialize_timer() resets the timer system to an empty queue with
+ * no thunks. It must be called before any other timer function,
+ * including timer_loop.
+ */
+void initialize_timer(void (*block)(when))
+{
+    thunks=0;
+    number_of_timers=0;
+    size_of_pqueue=10;
+    timers=(timer *)malloc(sizeof(timer)*size_of_pqueue);
+    block_function=block;
+}
diff --git a/lustre/portals/unals/pqtimer.h b/lustre/portals/unals/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef PQTIMER_H
+#define PQTIMER_H
+
+/* a point in time or an interval: the upper 32 bits count seconds,
+   the lower 32 bits count 1/2^32ths of a second */
+typedef unsigned long long when;
+when now(void);
+/* opaque handle; the structure itself is private to pqtimer.c */
+typedef struct timer *timer;
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+/* NOTE(review): no definition of register_timer_wait is visible in
+   pqtimer.c - confirm it exists elsewhere */
+timer register_timer_wait(void);
+void remove_timer(timer);
+void timer_loop(void);
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+/* one second expressed in `when` units */
+#define HZ 0x100000000ull
+
+#endif /* PQTIMER_H */
+
+
diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* api.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code: PTL_OK on success, PTL_SEGV if the
+ *          request could not be written or the reply could not be read
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ *
+ * forward_failure() issues one transfer and bails out of the calling
+ * function unless exactly `length` bytes were moved.
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       do {\
+          if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+             lib_fini(b->nal_cb);\
+             return(PTL_SEGV);\
+          }\
+       } while(0)
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    /* request layout: id, args_len, ret_len, then args_len bytes */
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    /* retry only when the read itself failed because a signal
+       interrupted it; errno is compared, never modified */
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k<0) && (errno == EINTR));
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index (unused)
+ * Returns: 0
+ *
+ * sends the PTL_FINI codepoint to the lower side, waits for its
+ *   answer, then closes both pipes and frees the procbridge state
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    procbridge pb=(procbridge)((bridge)n->nal_data)->local;
+    int msg=PTL_FINI;
+    int end;
+
+    /* request shutdown, then block until the lower side answers */
+    syscall(SYS_write, pb->to_lib[1],&msg,sizeof(msg));
+    syscall(SYS_read, pb->from_lib[0],&msg,sizeof(msg));
+
+    for (end=0;end<2;end++) syscall(SYS_close, pb->to_lib[end]);
+    for (end=0;end<2;end++) syscall(SYS_close, pb->from_lib[end]);
+
+    free(pb);
+    return(0);
+}
+
+
+/* Function: validate
+ *    stub required by the nal definition; performs no checking and
+ *    always returns 0
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    (void)nal;
+    (void)base;
+    (void)extent;
+    return 0;
+}
+
+
+/* Function: yield
+ * Arguments:  n: pointer to my top-side nal structure
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block on the procbridge condition
+ *   variable until signalled by the lower half.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    procbridge pb=(procbridge)((bridge)n->nal_data)->local;
+
+    pthread_mutex_lock(&pb->mutex);
+    pthread_cond_wait(&pb->cond,&pb->mutex);
+    pthread_mutex_unlock(&pb->mutex);
+}
+
+
+/* lock entry of the nal vector: intentionally a no-op */
+static void procbridge_lock(nal_t * nal, unsigned long *flags)
+{
+}
+/* unlock entry of the nal vector: intentionally a no-op */
+static void procbridge_unlock(nal_t * nal, unsigned long *flags)
+{
+}
+/* api_nal
+ *  the interface vector that lets the generic code reach this nal.
+ *  this is separate from the library side nal_cb.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+    .ni       = {0},
+    .nal_data = NULL,
+    .forward  = procbridge_forward,
+    .shutdown = procbridge_shutdown,
+    .validate = procbridge_validate,
+    .yield    = procbridge_yield,
+    .lock     = procbridge_lock,
+    .unlock   = procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ *
+ * unix_failure() issues one read or write system call and, unless
+ * exactly `length` bytes were transferred, reports `text` through
+ * perror() and makes the *calling function* return NULL. It is a bare
+ * if statement, so it must not be used as the body of an unbraced
+ * if/else.
+ */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+#if 0
+/* Compiled out: earlier initialisation entry point, superseded by
+ * procbridge_interface().  Kept for reference. */
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    /* rc is a pointer: transfer the size of the int it points to,
+       not the size of the pointer */
+    unix_failure(read,p->from_lib[0], rc, sizeof(*rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+/* node id copied into b->nal_cb->ni.nid by procbridge_interface();
+   it is not assigned anywhere in this file */
+ptl_nid_t tcpnal_mynid;
+
+/* Function: procbridge_interface
+ * Arguments: num_interface: not used by this function
+ *            ptl_size: requested portal table size, 0 for no request
+ *            acl_size: requested access control table size, 0 for
+ *                      no request
+ *            requested_pid: process id handed to the library side
+ * Returns: a pointer to the statically allocated top side NAL
+ *          structure, or NULL if any step of the start-up failed
+ *
+ * On the first call this creates the two pipes, sends the start-up
+ * parameters (pid, limits, nal type) to the library side, starts the
+ * library thread and waits for its status word.  Later calls return
+ * the same structure immediately.
+ */
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    if(!b || !p) {
+        perror("nal_init: malloc");
+        free(b);
+        free(p);
+        return(NULL);
+    }
+    api_nal.nal_data=b;
+    b->local=p;
+
+    /* the library thread signals through these as soon as it enters
+       its timer loop, so they have to be ready before it is created */
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    /* start-up parameters, in the order nal_thread() reads them */
+    unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    /* status word from the library side: non-zero means failure */
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+
+    return (&api_nal);
+}
+#undef unix_failure
diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* State shared between the api side and the library-side thread. */
+typedef struct procbridge {
+    pthread_t t;            /* the library-side thread running nal_thread() */
+    pthread_cond_t cond;    /* waited on by the api side's yield, broadcast
+                               by the library side */
+    pthread_mutex_t mutex;  /* mutex paired with `cond` */
+    int to_lib[2];          /* pipe: api side writes [1], library side reads [0] */
+    int from_lib[2];        /* pipe: library side writes [1], api side reads [0] */
+} *procbridge;
+
+/* entry point of the library-side thread; takes the bridge as argument */
+extern void *nal_thread(void *);
+
+
+/* command codes placed just above the generic dispatch range;
+   PTL_FINI asks the library side to shut down */
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)
+
+/* ceilings applied by nal_thread() to the requested table sizes */
+#define MAX_ACLS        1
+#define MAX_PTLS        128
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* lib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+/* cb_write callback: copies len bytes from src_addr to dst_addr with
+   a plain memcpy; nal and private are ignored. Always returns 0. */
+static int nal_write(nal_cb_t *nal,
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    (void)nal;
+    (void)private;
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+/* cb_read callback: copies len bytes from src_addr to dst_addr with
+   a plain memcpy; nal and private are ignored. Always returns 0. */
+static int nal_read(nal_cb_t * nal,
+                    void *private,
+                    void *dst_addr,
+                    user_ptr src_addr,
+                    size_t len)
+{
+    (void)nal;
+    (void)private;
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+/* cb_malloc callback: forwards to malloc(); nal is ignored. */
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    (void)nal;
+    return malloc(len);
+}
+
+/* cb_free callback: forwards to free(); nal and len are ignored. */
+static void nal_free(nal_cb_t *nal,
+                     void *buf,
+                     ptl_size_t len)
+{
+    (void)nal;
+    (void)len;
+    free(buf);
+}
+
+/* cb_printf callback: printf-style output to stdout; nal is ignored. */
+static void nal_printf(nal_cb_t *nal,
+                       const char *fmt,
+                       ...)
+{
+    va_list args;
+
+    (void)nal;
+    va_start(args, fmt);
+    vprintf(fmt, args);
+    va_end(args);
+}
+
+
+/* cb_cli callback: intentionally a no-op. */
+static void nal_cli(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+    (void)nal;
+    (void)flags;
+}
+
+
+/* cb_sti callback: intentionally a no-op. */
+static void nal_sti(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+    (void)nal;
+    (void)flags;
+}
+
+
+/* cb_dist callback: reports success without writing to *dist; all
+   arguments are ignored. */
+static int nal_dist(nal_cb_t *nal,
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    (void)nal;
+    (void)nid;
+    (void)dist;
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: arg: the bridge for this interface
+ * Returns: whether to continue reading from the pipe (1), or 0 after
+ *          a failed transfer or a malformed request
+ *
+ *   data_from_api() reads one request from the api side in response
+ *   to a select: a command index, the argument and result lengths,
+ *   and the argument block.  PTL_FINI shuts the lower side down and
+ *   ends the thread; any other command is dispatched and its result
+ *   block is written back.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       do {\
+          if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+             lib_fini(b->nal_cb);\
+             return(0);\
+          }\
+       } while(0)
+static int data_from_api(void *arg)
+{
+    bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    /* where are these two sizes derived from ??*/
+    char arg_block[ 256 ];
+    char ret_block[ 128 ];
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+
+    /* the lengths come off the pipe; refuse anything that would not
+       fit the fixed-size blocks instead of overrunning the stack.
+       this is handled like a failed transfer. */
+    if (arg_len > sizeof(arg_block) || ret_len > sizeof(ret_block)) {
+        lib_fini(b->nal_cb);
+        return(0);
+    }
+
+    data_failure(read,fd, arg_block, arg_len);
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    data_failure(write,p->from_lib[1],ret_block, ret_len);
+    return(1);
+}
+#undef data_failure
+
+
+
+/* Thunk registered by nal_thread(): broadcast on the procbridge
+   condition variable so that a waiting api-side thread wakes up. */
+static void wakeup_topside(void *z)
+{
+    procbridge pb=((bridge)z)->local;
+
+    pthread_mutex_lock(&pb->mutex);
+    pthread_cond_broadcast(&pb->cond);
+    pthread_mutex_unlock(&pb->mutex);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing, and only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on 
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ *
+ *  It reads the start-up parameters (pid, limits, nal type) from the
+ *  api side, initializes the selected nal and the generic library
+ *  code, and sends a status word back; zero means success.  If the
+ *  parameters cannot be read or are invalid, a non-zero status word
+ *  is sent so that the api side does not wait forever.
+ * 
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience
+ */
+#define LIMIT(x,y,max)\
+     do { if ((unsigned int)(x) > (max)) (y) = (max); } while (0)
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+    
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    if (!b->nal_cb) {
+        perror("procbridge malloc");
+        goto failed;
+    }
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    /* each parameter must arrive complete: an error (-1), end of file
+       (0) or a short read all leave the value unusable */
+    if(syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))
+       != sizeof(pid_request)) {
+        perror("procbridge read from api");
+        goto failed;
+    }
+    if(syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))
+       != sizeof(ptl_ni_limits_t)) {
+        perror("procbridge read from api");
+        goto failed;
+    }
+    if(syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))
+       != sizeof(nal_type)) {
+        perror("procbridge read from api");
+        goto failed;
+    }
+
+    /* nal_type indexes nal_table below, so it has to be in range */
+    if (nal_type < 0 ||
+        nal_type >= (int)(sizeof(nal_table)/sizeof(nal_table[0]))) {
+        fprintf(stderr, "procbridge: unknown nal type %d\n", nal_type);
+        goto failed;
+    }
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb, 
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+    /* this should perform error checking */
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+
+failed:
+    /* the api side is blocked reading a status word; any non-zero
+       value makes it give up */
+    rc = -1;
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    return(0);
+}
+#undef LIMIT
+
diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;
+static io_handler io_handlers;
+
+/* Function: now
+ *
+ * Return: the current time of day in canonical units: a 64 bit
+ *   fixed-point number whose most significant 32 bits hold whole
+ *   seconds and whose least significant 32 bits hold the fraction
+ *   of a second in units of 1/(2^32).
+ */
+when now()
+{
+    struct timeval tv;
+    unsigned long long seconds;
+    unsigned long long fraction;
+
+    gettimeofday(&tv,0);
+    seconds=(unsigned long long)tv.tv_sec;
+    /* rescale microseconds into 1/(2^32)ths of a second */
+    fraction=(((unsigned long long)tv.tv_usec)<<32)/1000000;
+    return((seconds<<32)|fraction);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure, or a null pointer
+ *          if the structure could not be allocated
+ *
+ * A handler registered with a negative fd is allocated and returned
+ * but is not linked into the dispatch list, so it is never serviced.
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+
+    /* out of memory: report it rather than dereference a null pointer */
+    if (!i) return(0);
+
+    if ((i->fd=fd)>=0){
+        i->type=type;
+        i->function=function;
+        i->argument=arg;
+        i->disabled=0;
+        /* push on the front of the list; 'last' points at whichever
+           pointer currently refers to this handler */
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * The handler is only flagged here, because freeing it immediately
+ * would cause reentrancy problems.  select_timer_block() unlinks and
+ * frees flagged handlers before it next waits for io.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+/* Function: set_flag
+ * Arguments: n: the handler whose descriptor should be watched
+ *            fds: an array of three fd_sets, in the order read,
+ *                 write, exception
+ *
+ * Adds n->fd to every set selected by the handler's type mask.
+ * A descriptor outside [0, FD_SETSIZE) is skipped, because FD_SET()
+ * on such a value is undefined and would write outside the set.
+ */
+static void set_flag(io_handler n,fd_set *fds)
+{
+    if (n->fd < 0 || n->fd >= FD_SETSIZE) return;
+
+    if (n->type & READ_HANDLER) FD_SET(n->fd,fds);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time, in the units returned by now(),
+ *                   when the select should return; 0 means block
+ *                   until io is available
+ *
+ *   This function first unlinks and frees every handler that has been
+ *   marked disabled, then waits in select() on the descriptors of the
+ *   remaining handlers and calls the handler function of each one the
+ *   kernel reports io for.  A handler whose function returns 0 is
+ *   marked disabled and is reclaimed on the next call.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+        when current=now();
+        /* a deadline that has already passed must poll, not wrap
+           around to an enormous unsigned interval */
+        when interval=(until>current)?(until-current):0;
+
+        timeout.tv_sec=(interval>>32);
+        /* the low 32 bits count 1/(2^32)ths of a second; scale them
+           to microseconds (the product stays below 2^52) */
+        timeout.tv_usec=((interval&0xffffffffull)*1000000)>>32;
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            /* unlink and free, then re-examine the successor so that
+               consecutive disabled handlers are all reclaimed and
+               none of their descriptors is handed to select() */
+            j=*k;
+            *k=j->next;
+            /* keep the successor's back-link pointing at the pointer
+               that now refers to it, as register_io_handler() does */
+            if (*k) (*k)->last=k;
+            free(j);
+            continue;
+        }
+        set_flag(*k,fds);
+        k=&(*k)->next;
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            /* FD_ISSET() is undefined outside [0, FD_SETSIZE) */
+            if (j->disabled || j->fd < 0 || j->fd >= FD_SETSIZE) continue;
+            if ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER))){
+                /* a zero return asks for the handler to be dropped */
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library: it records the current time
+ *   in beginning_of_epoch, empties the list of io handlers and hands
+ *   select_timer_block() to initialize_timer().
+ */
+void init_unix_timer()
+{
+    gettimeofday(&beginning_of_epoch, 0);
+    io_handlers=0;
+    initialize_timer(select_timer_block);
+}
diff --git a/lustre/portals/unals/table.c b/lustre/portals/unals/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table
+ */
+
+/* Function: table_lookup
+ * Arguments: t: the table to search
+ *            comparator: the value handed to the table's compare
+ *                        function as its second argument
+ *            k: the hash key of comparator
+ *            compare_function: when null, no comparison is made and
+ *                              the lookup always misses
+ *            success: set to 1 on a match, 0 otherwise
+ * Returns: on a match, the address of the link that points at the
+ *          matching entry; otherwise the address of the head of the
+ *          bucket k hashes to
+ */
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    table_entry *bucket=&(t->entries[k%t->size]);
+    table_entry *link=bucket;
+
+    while (*link){
+        if (compare_function && ((*link)->key==k) &&
+            (*t->compare_function)((*link)->value,comparator)){
+            *success=1;
+            return(link);
+        }
+        link=&((*link)->next);
+    }
+    *success=0;
+    return(bucket);
+}
+
+
+/* Function: resize_table
+ * Arguments: t: the table to rehash
+ *            size: the new number of buckets
+ *
+ * Moves every entry into a freshly allocated bucket array of the
+ * requested size.  If size is not positive or the new array cannot
+ * be allocated the table is left exactly as it was.
+ */
+static void resize_table(table t, int size)
+{
+    int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    table_entry *new_entries;
+    int i; 
+    table_entry j,n;
+    table_entry *position;
+    int success;
+  
+    /* a bucket count of zero would make table_lookup() divide by zero */
+    if (size<=0) return;
+
+    /* allocate before touching the table, so that a failure leaves
+       the existing buckets fully usable */
+    new_entries=(table_entry *)calloc(size,sizeof(table_entry));
+    if (!new_entries) return;
+
+    t->size=size;
+    t->entries=new_entries;
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            /* a null compare function makes the lookup miss, which
+               yields the head of the entry's new bucket */
+            position=table_lookup(t,0,j->key,0,&success);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: value to compute the key of
+ * Returns: the key, which is i itself converted to unsigned int
+ */
+unsigned int key_from_int(int i)
+{
+    return((unsigned int)i);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string
+ *                     to compute the key of
+ * Returns: the key; 1 for a null pointer, 0 for the empty string
+ *
+ * Each byte, taken as unsigned, contributes (byte*57) xor
+ * (byte*position) to a running exclusive-or.
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int hash=0;
+    unsigned char *cursor;
+    int position=0;
+
+    if (!s) return(1);
+    for (cursor=(unsigned char *)s;*cursor;cursor++){
+        unsigned char c=*cursor;
+
+        hash^=(c*57)^(c*position);
+        position++;
+    }
+    return(hash);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new, empty table with four buckets, or
+ *          a null pointer if memory could not be allocated
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)calloc(1,sizeof(struct table));
+
+    if (!new) return(0);
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    new->entries=(table_entry *)calloc(new->size,sizeof(table_entry));
+    if (!new->entries){
+        /* do not hand back a table without buckets */
+        free(new);
+        return(0);
+    }
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: a table to look in
+ *            comparator: a value to access the table entry
+ * Returns: the element referred to by comparator, or null when the
+ *          table holds no matching entry
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *slot=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+
+    if (!found) return(0);
+    return((*slot)->value);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ *
+ * If an entry matching comparator already exists its value is
+ * replaced; otherwise a new entry is added at the head of its bucket.
+ * The bucket array is doubled once the number of entries exceeds the
+ * number of buckets.  If a new entry cannot be allocated the table is
+ * left unchanged.
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)calloc(1,sizeof(struct table_entry));
+        /* there is no way to report failure through a void return;
+           leave the table alone rather than dereference null */
+        if (!entry) return;
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ *
+ * Does nothing when no entry matches comparator.
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int found;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *link=table_lookup(t,comparator,k,
+                                   t->compare_function,&found);
+    table_entry victim;
+
+    if (!found) return;
+
+    victim=*link;
+    *link=victim->next;
+    /* only the entry is released; the value it held is not freed */
+    free(victim);
+    t->number_of_entries--;
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call once for each element
+ *                     of the table; it receives arg first and the
+ *                     element's value second
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+
+    for (i=0;i<t->size;i++){
+        table_entry *link=t->entries+i;
+
+        while (*link){
+            /* note where to continue before calling out */
+            table_entry *after=&((*link)->next);
+
+            (*handler)(arg,(*link)->value);
+            link=after;
+        }
+    }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() calls the handler function for each
+ *   item in the table, passing it arg and the item's value. The
+ *   handler returns nonzero if the item is to be retained in the
+ *   table, and 0 if it is to be removed. Only the table's entry is
+ *   freed on removal, not the value.
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+
+    for (i=0;i<t->size;i++){
+        table_entry *link=t->entries+i;
+
+        while (*link){
+            table_entry *after=&((*link)->next);
+            table_entry doomed;
+
+            if ((*handler)(arg,(*link)->value)){
+                link=after;
+                continue;
+            }
+            /* unlink in place; 'link' now refers to the successor */
+            doomed=*link;
+            *link=(*link)->next;
+            free(doomed);
+            t->number_of_entries--;
+        }
+    }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: a function to call with each element's value,
+ *                   most likely free(); may be null, in which case
+ *                   the values are left untouched
+ * Returns: nothing
+ *
+ * Frees every entry, the bucket array and the table itself.
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    int i;
+
+    for (i=0;i<t->size;i++){
+        table_entry e=t->entries[i];
+
+        while (e){
+            table_entry following=e->next;
+
+            if (thunk) (*thunk)(e->value);
+            free(e);
+            e=following;
+        }
+    }
+    free(t->entries);
+    free(t);
+}
diff --git a/lustre/portals/unals/table.h b/lustre/portals/unals/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;            /* hash key the entry was stored under */
+  void *value;                 /* the object stored by the caller */
+  struct table_entry *next;    /* next entry chained in the same bucket */
+} *table_entry;
+
+
+typedef struct table {
+  unsigned int size;           /* number of buckets in entries */
+  int number_of_entries;       /* number of entries currently stored */
+  table_entry *entries;        /* array of bucket chain heads */
+  int (*compare_function)(void *, void *);       /* nonzero when a stored value
+                                                  * matches the comparator */
+  unsigned int (*key_function)(unsigned int *);  /* hash key of a comparator */
+} *table;
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c
new file mode 100644 (file)
index 0000000..8bf55c4
--- /dev/null
@@ -0,0 +1,196 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcpnal.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: n:       pointer to my nal control block
+ *            private: handed on to lib_finalize()
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            type:    not used yet (see the warning below)
+ *            nid:     destination node
+ *            pid:     destination process
+ *            niov:    number of entries in iov; at most one
+ *            iov:     io vector holding the body of the message
+ *            len:     length of the body
+ * Returns: zero on success, one if no connection could be obtained
+ *
+ * sends a packet to the peer, after ensuring that a connection exists.
+ * the header and, when len is nonzero, the body are handed to a
+ * single writev call.
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    bridge b=(bridge)n->nal_data;
+    connection c=force_tcp_connection((manager)b->lower,
+                                      PNAL_IP(nid,b),
+                                      PNAL_PORT(nid,pid));
+    struct iovec tiov[2];
+    int count;
+
+    if (!c)
+        return(1);
+
+    LASSERT (niov <= 1);
+
+    /* the portals header always travels first */
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+    count = 1;
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            count = 2;
+    }
+
+    /* TODO: this result should be checked. furthermore, provision
+       must be made for the SIGPIPE which is delivered when
+       writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call which turns off the signalling behaviour, but it is
+       nonstandard.
+       NOTE(review): a short or failed write is currently reported
+       to the library as success. */
+    syscall(SYS_writev, c->fd, tiov, count);
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *n:       pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            niov:              number of entries in iov; at most one
+ *            iov:               io vector naming the destination buffer
+ *            ptl_size_t mlen:   length of the body to deliver
+ *            ptl_size_t rlen:   length of data in the network
+ * Returns: rlen
+ *
+ * blocking read of the requested data. must drain out the
+ * difference of manipulated and requested lengths from the network.
+ * the surplus is read in bounded pieces into a scratch buffer, so no
+ * allocation sized by the peer is needed.
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    /* 'rlen > mlen' rather than '!=': a short rlen must not wrap
+       around into an enormous drain length */
+    if (rlen > mlen){
+        char trash[4096];
+        ptl_size_t remaining=rlen-mlen;
+
+        while (remaining){
+            size_t chunk=sizeof(trash);
+
+            if (remaining < chunk) chunk=remaining;
+            /*TODO: check error status*/
+            /* from_connection() treats a zero return from
+               read_connection() as failure; stop draining then */
+            if (!read_connection(private,trash,chunk)) break;
+            remaining-=chunk;
+        }
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection: 
+ * Arguments: a: the bridge this connection belongs to
+ *            c: the connection to read from 
+ * Returns: whether or not to continue reading from this connection,
+ *          expressed as a 1 to continue, and a 0 to not
+ *
+ *  from_connection() is called from the select loop when i/o is 
+ *  available. It attempts to read the portals header and 
+ *  pass it to the generic library for processing.
+ */
+static int from_connection(void *a,connection c)
+{
+    bridge b=a;
+    ptl_hdr_t hdr;
+
+    /* no header could be read: stop servicing this connection */
+    if (!read_connection(c, (unsigned char *)&hdr, sizeof(hdr)))
+        return(0);
+
+    lib_parse(b->nal_cb, &hdr, c);
+    return(1);
+}
+
+
+/* Function:  tcpnal_shutdown
+ * Arguments: b: the bridge being shut down
+ *
+ * installed as the bridge's shutdown hook by tcpnal_init(); passes
+ * the bridge's connection manager to shutdown_connections().
+ */
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the bridge to attach the tcp nal to
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if the connection
+ *          manager could not be created
+ *
+ * installs the tcp send, receive and shutdown callbacks on the
+ * bridge, then creates the connection manager on the port derived
+ * from the bridge's nid and pid and stores it in b->lower.
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+        
+    b->nal_cb->cb_send=tcpnal_send;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->shutdown=tcpnal_shutdown;
+    
+    m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                 b->nal_cb->ni.pid),
+                       from_connection,b);
+    if (!m){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+
+    b->lower=m;
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    return(PTL_OK);
+}
diff --git a/lustre/portals/unals/timer.h b/lustre/portals/unals/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* TODO: make this an explicit type when they become available */
+typedef unsigned long long when;
+
+typedef struct timer {
+  void (*function)(void *);  /* callback; presumably called with arg when
+                              * the timer fires -- TODO confirm */
+  void *arg;                 /* opaque argument kept for function */
+  when w;                    /* presumably the absolute expiry time, in
+                              * 'when' units -- TODO confirm */
+  int interval;              /* NOTE(review): units and use are not visible
+                              * from this header */
+  int disable;               /* presumably set by remove_timer() -- TODO
+                              * confirm */
+} *timer;
+
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void);
+void register_thunk(void (*f)(void *),void *a);
+
+
+#define HZ 0x100000000ull
+
+
diff --git a/lustre/portals/unals/utypes.h b/lustre/portals/unals/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned short uint16;
+typedef unsigned long uint32;  /* NOTE(review): 'unsigned long' is 64 bits
+                                * wide on LP64 targets, so this is a 32 bit
+                                * type only on ILP32 -- confirm the intended
+                                * platforms */
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lustre/portals/utils/.cvsignore b/lustre/portals/utils/.cvsignore
new file mode 100644 (file)
index 0000000..041cd6b
--- /dev/null
@@ -0,0 +1,7 @@
+Makefile
+Makefile.in
+acceptor
+debugctl
+ptlctl
+.deps
+routerstat
diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am
new file mode 100644 (file)
index 0000000..065fcf9
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+COMPILE = gcc -Wall -g -I$(srcdir)/../include 
+LINK = gcc -o $@
+
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat
+lib_LIBRARIES = libptlctl.a
+
+acceptor_SOURCES = acceptor.c # -lefence
+
+libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+
+ptlctl_SOURCES = ptlctl.c
+ptlctl_LDADD =  -L. -lptlctl -lncurses # -lefence
+ptlctl_DEPENDENCIES = libptlctl.a
+
+debugctl_SOURCES = debugctl.c
+debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
+debugctl_DEPENDENCIES = libptlctl.a
+
+routerstat_SOURCES = routerstat.c
diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c
new file mode 100644 (file)
index 0000000..c6590db
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <asm/byteorder.h>
+#include <syslog.h>
+
+#include <errno.h>
+
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
+/* should get this from autoconf somehow */
+#ifndef PIDFILE_DIR
+#define PIDFILE_DIR "/var/run"
+#endif 
+
+#define PROGNAME "acceptor"
+
+/* Writes the pid of the calling process, followed by a newline, to
+ * PIDFILE_DIR/<name>-<port>.pid.  If the file cannot be opened the
+ * reason is sent to syslog and nothing is written. */
+void create_pidfile(char *name, int port)
+{
+        char path[1024];
+        FILE *out;
+
+        snprintf(path, sizeof(path), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        out = fopen(path, "w");
+        if (out == NULL) {
+                syslog(LOG_ERR, "%s: %s\n", path, 
+                       strerror(errno));
+                return;
+        }
+
+        fprintf(out, "%d\n", getpid());
+        fclose(out);
+}
+
+/* Returns 1, after complaining on stderr, when
+ * PIDFILE_DIR/<name>-<port>.pid exists, and 0 when it does not. */
+int pidfile_exists(char *name, int port)
+{
+        char path[1024];
+
+        snprintf(path, sizeof(path), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        if (access(path, F_OK) != 0)
+                return (0);
+
+        fprintf(stderr, "%s: exists, acceptor already running.\n", 
+                path);
+        return (1);
+}
+
+/* Parses a decimal size with an optional one-letter multiplier:
+ * k or K (2^10), m or M (2^20), g or G (2^30).
+ *
+ * sizep: receives the parsed size; left untouched on failure
+ * str:   the text to parse
+ *
+ * Returns 0 on success.  Returns -1 when str does not start with a
+ * number, or when the scaled value does not fit in an int (for
+ * example "4G"), which previously overflowed the left shift. */
+int
+parse_size (int *sizep, char *str)
+{
+        int             size;
+        int             shift = 0;
+        long long       scaled;
+        long long       int_max = (long long)(~0u >> 1);
+        char            mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod))
+        {
+        default:
+                return (-1);
+
+        case 1:
+                break;
+
+        case 2:
+                switch (*mod)
+                {
+                case 'g':
+                case 'G':
+                        shift = 30;
+                        break;
+
+                case 'm':
+                case 'M':
+                        shift = 20;
+                        break;
+
+                case 'k':
+                case 'K':
+                        shift = 10;
+                        break;
+
+                default:
+                        break;
+                }
+                break;
+        }
+
+        /* scale in a wider type and multiply rather than shift, so
+         * that neither large nor negative sizes are undefined */
+        scaled = (long long)size * (1LL << shift);
+        if (scaled > int_max || scaled < -int_max - 1)
+                return (-1);
+
+        *sizep = (int)scaled;
+        return (0);
+}
+
+/* Logs one line to syslog describing an accepted connection: the
+ * peer's host name (or dotted quad when it cannot be resolved), its
+ * NID, the socket's send and receive buffer sizes and whether nagle
+ * is enabled.
+ *
+ * fd:     the connected socket
+ * net_ip: the peer's IPv4 address in network byte order
+ * nid:    the peer's NID
+ *
+ * A failing getsockopt() is reported with perror() and the
+ * corresponding value is logged as 0. */
+void
+show_connection (int fd, __u32 net_ip, ptl_nid_t nid)
+{
+        struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET);
+        __u32 host_ip = ntohl (net_ip);
+        int  rxmem = 0;
+        int  txmem = 0;
+        int  nonagle = 0;
+        socklen_t len;          /* getsockopt() takes a socklen_t *, not int * */
+        char host[1024];
+        
+        len = sizeof (txmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0)
+                perror ("Cannot get write buffer size");
+        
+        len = sizeof (rxmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0)
+                perror ("Cannot get read buffer size");
+        
+        len = sizeof (nonagle);
+        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0)
+                perror ("Cannot get nagle");
+
+        if (h == NULL)
+                snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff,
+                                    (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff);
+        else
+                snprintf (host, sizeof(host), "%s", h->h_name);
+                
+        syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", 
+                 host, nid, txmem, rxmem, nonagle ? "disabled" : "enabled");
+}
+
+/* Writes exactly nob bytes from buffer to cfd, retrying after EINTR
+ * and continuing after short writes.
+ *
+ * Returns 0 once everything has been written, or the negative result
+ * of write() with errno set.  A write() that returns 0 is treated as
+ * a fatal inconsistency and aborts the process. */
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                /* step past the bytes just written; advancing by the
+                 * remaining count would resend the wrong data after
+                 * a short write */
+                buffer = (char *)buffer + rc;
+                nob -= rc;
+        }
+        
+        return (0);
+}
+
+/* Reads exactly nob bytes from cfd into buffer, retrying after EINTR
+ * and continuing after short reads.
+ *
+ * Returns 0 once nob bytes have been read, the negative result of
+ * read() with errno set on error, or -1 with errno set to
+ * ECONNABORTED if end of file arrives first. */
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                /* step past the bytes just received; advancing by the
+                 * remaining count would leave a gap and could run
+                 * past the end of the caller's buffer */
+                buffer = (char *)buffer + rc;
+                nob -= rc;
+        }
+        
+        return (0);
+}
+
+/* Performs the portals HELLO handshake on a connected socket.
+ *
+ * cfd:      the connected socket
+ * my_nid:   the NID to announce to the peer
+ * peer_nid: receives the NID the peer announced
+ *
+ * A HELLO header carrying the protocol magic and version in place of
+ * the destination NID is sent, then the peer's header is read and
+ * checked.  Returns 0 on success; returns -1 after printing a
+ * diagnostic if the socket fails, the magic or protocol version does
+ * not match, or the peer's header is not a HELLO with no payload. */
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        /* values read off the wire are little-endian: decode them */
+        if (__le32_to_cpu (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __le32_to_cpu (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__le16_to_cpu (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __le16_to_cpu (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __le16_to_cpu (hmv->version_major),
+                         __le16_to_cpu (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                /* an incompatible peer must be rejected, not merely
+                 * reported */
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__le32_to_cpu (hdr.type) != PTL_MSG_HELLO ||
+            __le32_to_cpu (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __le32_to_cpu (hdr.type),
+                         __le32_to_cpu (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+/* Prints the command line synopsis to stderr and exits with status 1.
+ * The synopsis lists every switch in main()'s getopt string,
+ * including -l, -x and -i, which the previous text left out.
+ *
+ * myname: the program name to show, normally argv[0] */
+void
+usage (char *myname)
+{
+        fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-l] [-x] [-i] [-N nal_id] port\n", myname);
+        exit (1);
+}
+
+/*
+ * Acceptor entry point.
+ *
+ * Parses the command line, opens a listening TCP socket on the requested
+ * port, opens /dev/portals, daemonizes, and then loops forever accepting
+ * connections.  Every accepted socket is handed to the kernel with the
+ * NAL_CMD_REGISTER_PEER_FD command and then closed in this process.
+ *
+ * Options:
+ *   -r size   socket receive buffer size (SO_RCVBUF)
+ *   -s size   socket send buffer size (SO_SNDBUF)
+ *   -n        do not set TCP_NODELAY (it is set by default)
+ *   -l        passed as the 'noclose' argument of daemon()
+ *   -x        exchange NIDs with the peer instead of deriving the peer NID
+ *             from its IP address
+ *   -i        stored in ioc_flags of the register ioctl (bind_irq)
+ *   -N nal    NAL id to use (defaults to SOCKNAL)
+ *
+ * Exits with status 1 on any setup failure or if accept() fails.
+ */
+int main(int argc, char **argv)
+{
+        int o, fd, rc, port, pfd;
+        struct sockaddr_in srvaddr;
+        int c;
+        int rxmem = 0;
+        int txmem = 0;
+        int noclose = 0;
+        int nonagle = 1;
+        int nal = SOCKNAL;
+        int xchg_nids = 0;
+        int bind_irq = 0;
+
+        while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1)
+                switch (c)
+                {
+                case 'r':
+                        if (parse_size (&rxmem, optarg) != 0 || rxmem < 0)
+                                usage (argv[0]);
+                        break;
+
+                case 's':
+                        if (parse_size (&txmem, optarg) != 0 || txmem < 0)
+                                usage (argv[0]);
+                        break;
+
+                case 'n':
+                        nonagle = 0;
+                        break;
+
+                case 'l':
+                        noclose = 1;
+                        break;
+
+                case 'x':
+                        xchg_nids = 1;
+                        break;
+
+                case 'i':
+                        bind_irq = 1;
+                        break;
+
+                case 'N':
+                        if (parse_size(&nal, optarg) != 0 ||
+                            nal < 0 || nal > NAL_MAX_NR)
+                                usage(argv[0]);
+                        break;
+
+                default:
+                        usage (argv[0]);
+                        break;
+                }
+
+        if (optind >= argc)
+                usage (argv[0]);
+
+        port = atol(argv[optind++]);
+
+        /* sin_port is 16 bits wide; htons() would silently truncate
+         * anything outside this range */
+        if (port <= 0 || port > 65535)
+                usage (argv[0]);
+
+        if (pidfile_exists(PROGNAME, port))
+                exit(1);
+
+        memset(&srvaddr, 0, sizeof(srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(port);
+        srvaddr.sin_addr.s_addr = INADDR_ANY;
+
+        fd = socket(PF_INET, SOCK_STREAM, 0);
+        if (fd < 0) {
+                perror("opening socket");
+                exit(1);
+        }
+
+        o = 1;
+        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) {
+                perror("Cannot set REUSEADDR socket opt");
+                exit(1);
+        }
+
+        if (nonagle)
+        {
+                o = 1;
+                rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
+                if (rc != 0)
+                {
+                        perror ("Cannot disable nagle");
+                        exit (1);
+                }
+        }
+
+        if (txmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set write buffer size");
+                        exit (1);
+                }
+        }
+
+        if (rxmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set read buffer size");
+                        exit (1);
+                }
+        }
+
+        rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+        if ( rc == -1 ) {
+                perror("bind: ");
+                exit(1);
+        }
+
+        if (listen(fd, 127)) {
+                perror("listen: ");
+                exit(1);
+        }
+        fprintf(stderr, "listening on port %d\n", port);
+
+        pfd = open("/dev/portals", O_RDWR);
+        if ( pfd < 0 ) {
+                perror("opening portals device");
+                exit(1);
+        }
+
+        rc = daemon(1, noclose);
+        if (rc < 0) {
+                perror("daemon(): ");
+                exit(1);
+        }
+
+        openlog(PROGNAME, LOG_PID, LOG_DAEMON);
+        syslog(LOG_INFO, "started, listening on port %d\n", port);
+        create_pidfile(PROGNAME, port);
+
+        while (1) {
+                struct sockaddr_in clntaddr;
+                /* accept() takes a socklen_t *, not an int * */
+                socklen_t len = sizeof(clntaddr);
+                int cfd;
+                struct portal_ioctl_data data;
+                ptl_nid_t peer_nid;
+
+                cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
+                if ( cfd < 0 ) {
+                        /* a failed accept() ends the daemon; report it
+                         * through a non-zero exit status */
+                        perror("accept");
+                        exit(1);
+                }
+
+                if (!xchg_nids)
+                        peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */
+                else
+                {
+                        /* ask the kernel for our own NID, then swap NIDs
+                         * with the peer over the new connection */
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = nal;
+                        rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data);
+                        if (rc < 0)
+                        {
+                                perror ("Can't get my NID");
+                                close (cfd);
+                                continue;
+                        }
+
+                        rc = exchange_nids (cfd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (cfd);
+                                continue;
+                        }
+                }
+
+                show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid);
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = cfd;
+                data.ioc_nal = nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+
+                if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
+                        perror("ioctl failed");
+
+                } else {
+                        printf("client registered\n");
+                }
+                /* our copy of the descriptor is closed whether or not the
+                 * registration succeeded */
+                rc = close(cfd);
+                if (rc)
+                        perror ("close failed");
+        }
+
+        /* not reached: the accept loop above never terminates normally */
+        closelog();
+        exit(0);
+
+}
diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c
new file mode 100644 (file)
index 0000000..13572dc
--- /dev/null
@@ -0,0 +1,620 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <syscall.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#define BUG()                            /* workaround for module.h includes */
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/module.h>
+#endif
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+/* Scratch buffer that the jt_dbg_* commands pass to portal_ioctl_pack()
+ * and l_ioctl(); 'buf' points at it and 'max' is its size in bytes. */
+static char rawbuf[8192];
+static char *buf = rawbuf;
+static int max = 8192;
+//static int g_pfd = -1;
+/* Per-subsystem enable flags, indexed by subsystem number; a non-zero
+ * entry means lines from that subsystem are kept by dump_buffer(). */
+static int subsystem_array[1 << 8];
+/* Bitmask of enabled debug types; bit i corresponds to
+ * portal_debug_masks[i].  All types start enabled. */
+static int debug_mask = ~0;
+
+/* Subsystem names, NULL-terminated; the array index is the subsystem
+ * number used to index subsystem_array. */
+static const char *portal_debug_subsystems[] =
+        {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite",
+         "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter",
+         "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL};
+/* Debug type names, NULL-terminated; the array index is the bit position
+ * of that type in debug_mask. */
+static const char *portal_debug_masks[] =
+        {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
+         "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
+         "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL};
+
+/* Maps a debug_daemon sub-command keyword to its DEBUG_DAEMON_* value. */
+struct debug_daemon_cmd {
+        char *cmd;
+        unsigned int cmdv;
+};
+
+/* Keywords accepted by jt_dbg_debug_daemon(); the table ends with a
+ * {0, 0} sentinel entry. */
+static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = {
+        {"start", DEBUG_DAEMON_START},
+        {"stop", DEBUG_DAEMON_STOP},
+        {"pause", DEBUG_DAEMON_PAUSE},
+        {"continue", DEBUG_DAEMON_CONTINUE},
+        {0, 0}
+};
+
+/*
+ * Enable or disable every subsystem and every debug type whose name
+ * matches 'name' (case-insensitive).  The special names "all_subs" and
+ * "all_types" match every subsystem and every debug type respectively.
+ *
+ * Subsystems are switched through subsystem_array, debug types through
+ * the corresponding bit of debug_mask.  One line is printed per match.
+ *
+ * Returns 1 if at least one subsystem or type matched, 0 otherwise.
+ */
+static int do_debug_mask(char *name, int enable)
+{
+        const char *verb = enable ? "Enabling" : "Disabling";
+        int every_sub = (strcasecmp(name, "all_subs") == 0);
+        int every_type = (strcasecmp(name, "all_types") == 0);
+        int matched = 0;
+        int idx;
+
+        for (idx = 0; portal_debug_subsystems[idx] != NULL; idx++) {
+                if (!every_sub &&
+                    strcasecmp(name, portal_debug_subsystems[idx]) != 0)
+                        continue;
+
+                printf("%s output from subsystem \"%s\"\n",
+                        verb, portal_debug_subsystems[idx]);
+                subsystem_array[idx] = enable;
+                matched = 1;
+        }
+
+        for (idx = 0; portal_debug_masks[idx] != NULL; idx++) {
+                if (!every_type &&
+                    strcasecmp(name, portal_debug_masks[idx]) != 0)
+                        continue;
+
+                printf("%s output of type \"%s\"\n",
+                        verb, portal_debug_masks[idx]);
+                if (enable)
+                        debug_mask |= (1 << idx);
+                else
+                        debug_mask &= ~(1 << idx);
+                matched = 1;
+        }
+
+        return matched;
+}
+
+/*
+ * Initialise the debug filter state: every subsystem starts enabled.
+ * 'argc' and 'argv' are unused.  Always returns 0.
+ */
+int dbg_initialize(int argc, char **argv)
+{
+        unsigned int i;
+
+        /* assign element-wise: memset() with a byte value of 1 would store
+         * 0x01010101 in each int rather than 1 */
+        for (i = 0; i < sizeof(subsystem_array) / sizeof(subsystem_array[0]); i++)
+                subsystem_array[i] = 1;
+        return 0;
+}
+
+/*
+ * "filter" command: disable output for every subsystem or debug type
+ * named in argv[1..].  Unknown names are reported on stderr.
+ * Always returns 0.
+ */
+int jt_dbg_filter(int argc, char **argv)
+{
+        int   arg;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        arg = 1;
+        while (arg < argc) {
+                char *name = argv[arg++];
+
+                if (do_debug_mask(name, 0))
+                        continue;
+                fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                        name);
+        }
+        return 0;
+}
+
+/*
+ * "show" command: enable output for every subsystem or debug type named
+ * in argv[1..].  Unknown names are reported on stderr.
+ * Always returns 0.
+ */
+int jt_dbg_show(int argc, char **argv)
+{
+        int    arg;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        arg = 1;
+        while (arg < argc) {
+                char *name = argv[arg++];
+
+                if (do_debug_mask(name, 1))
+                        continue;
+                fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                        name);
+        }
+
+        return 0;
+}
+
+/*
+ * Write 'value' as decimal text to the file named by 'procpath'.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened, or the negative
+ * result of write() if the write fails.  A message is printed to stderr
+ * on either failure.
+ */
+static int applymask(char* procpath, int value)
+{
+        int rc;
+        char text[64];
+        int len = snprintf(text, sizeof(text), "%d", value);
+
+        int fd = open(procpath, O_WRONLY);
+        if (fd == -1) {
+                fprintf(stderr, "Unable to open %s: %s\n",
+                        procpath, strerror(errno));
+                return fd;
+        }
+        /* len+1: the terminating NUL is written along with the digits */
+        rc = write(fd, text, len+1);
+        if (rc<0) {
+                fprintf(stderr, "Write to %s failed: %s\n",
+                        procpath, strerror(errno));
+                /* don't leak the descriptor on the error path */
+                close(fd);
+                return rc;
+        }
+        close(fd);
+        return 0;
+}
+
+extern char *dump_filename;
+extern int dump(int dev_id, int opc, void *buf);
+
+/*
+ * Apply the subsystem and debug masks.
+ *
+ * When no dump file is configured (dump_filename is NULL) the masks are
+ * written to /proc/sys/portals/subsystem_debug and /proc/sys/portals/debug.
+ * Otherwise a PTL_IOC_DEBUG_MASK record carrying both masks is passed to
+ * dump().  A confirmation line is printed in either case; failures of the
+ * individual steps are not checked here.
+ */
+static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
+{
+        if (dump_filename) {
+                struct portals_debug_ioctl_data data;
+
+                data.hdr.ioc_len = sizeof(data);
+                data.hdr.ioc_version = 0;
+                data.subs = subs_mask;
+                data.debug = debug_mask;
+
+                dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data);
+        } else {
+                applymask("/proc/sys/portals/subsystem_debug", subs_mask);
+                applymask("/proc/sys/portals/debug", debug_mask);
+        }
+
+        printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n",
+               subs_mask, debug_mask);
+}
+
+/*
+ * "list" command.  argv[1] selects the action (case-insensitive):
+ *   subs        print the known subsystem names
+ *   types       print the known debug type names
+ *   applymasks  build a subsystem bitmask from subsystem_array and apply
+ *               it together with debug_mask via applymask_all()
+ * Any other argument, or a wrong argument count, prints the usage line.
+ * Always returns 0.
+ */
+int jt_dbg_list(int argc, char **argv)
+{
+        int i;
+
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <subs || types || applymasks>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        if (strcasecmp(argv[1], "subs") == 0) {
+                printf("Subsystems: all_subs");
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++)
+                        printf(", %s", portal_debug_subsystems[i]);
+                printf("\n");
+        } else if (strcasecmp(argv[1], "types") == 0) {
+                printf("Types: all_types");
+                for (i = 0; portal_debug_masks[i] != NULL; i++)
+                        printf(", %s", portal_debug_masks[i]);
+                printf("\n");
+        }
+        else if (strcasecmp(argv[1], "applymasks") == 0) {
+                unsigned int subsystem_mask = 0;
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
+                        if (subsystem_array[i]) subsystem_mask |= (1 << i);
+                }
+                applymask_all(subsystem_mask, debug_mask);
+        } else {
+                /* previously an unrecognised argument was silently ignored */
+                fprintf(stderr, "usage: %s <subs || types || applymasks>\n",
+                        argv[0]);
+        }
+        return 0;
+}
+
+/*
+ * Filter and print a debug log held in memory.
+ *
+ * 'buf' holds 'size' bytes of newline-terminated lines.  Each line starts
+ * with a hexadecimal subsystem number, one separator character, a
+ * hexadecimal debug mask, and one more separator character.  A line is
+ * written to 'fd' when its subsystem is a known one that is enabled in
+ * subsystem_array and its debug value is either zero or shares a bit with
+ * debug_mask.  Trailing bytes after the last newline are ignored.
+ *
+ * if 'raw' is true, don't strip the debug information from the front of the
+ * lines
+ *
+ * The buffer is modified temporarily (each newline is overwritten and then
+ * restored), so it must be writable.  A summary is printed to stdout.
+ */
+static void dump_buffer(FILE *fd, char *buf, int size, int raw)
+{
+        char *p, *z;
+        unsigned long subsystem, debug, dropped = 0, kept = 0;
+        unsigned long max_sub;
+
+        for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++)
+                ;
+
+        while (size) {
+                p = memchr(buf, '\n', size);
+                if (!p)
+                        break;
+
+                /* Terminate the line before parsing it, so that strtoul()
+                 * can never read into the following line or past the end
+                 * of the buffer when a line is malformed. */
+                *p = '\0';
+
+                subsystem = strtoul(buf, &z, 16);
+                if (z < p)
+                        z++;            /* skip the separator */
+                debug = strtoul(z, &z, 16);
+                if (z < p)
+                        z++;            /* skip the separator */
+
+                if (subsystem < max_sub &&
+                    subsystem_array[subsystem] &&
+                    (!debug || (debug_mask & debug))) {
+                        if (raw)
+                                fprintf(fd, "%s\n", buf);
+                        else
+                                fprintf(fd, "%s\n", z);
+                        kept++;
+                } else {
+                        dropped++;
+                }
+
+                /* restore the newline and step to the next line */
+                *p = '\n';
+                p++;
+                size -= (p - buf);
+                buf = p;
+        }
+
+        printf("Debug log: %lu lines, %lu kept, %lu dropped.\n",
+                dropped + kept, kept, dropped);
+}
+
+/*
+ * "debug_kernel" command: fetch the kernel debug buffer with the
+ * IOC_PORTAL_GET_DEBUG ioctl and print it through dump_buffer().
+ *
+ *   argv[1]  optional output file (default: stdout)
+ *   argv[2]  optional 'raw' flag, parsed with atoi() (default: 1)
+ *
+ * Returns 0 on success or after printing the usage message, -1 if the
+ * output file cannot be opened, memory cannot be allocated, or the ioctl
+ * cannot be packed or issued.
+ */
+int jt_dbg_debug_kernel(int argc, char **argv)
+{
+        int rc, raw = 1;
+        int result = -1;        /* set to 0 only once the ioctl succeeded */
+        FILE *fd = stdout;
+        const int databuf_size = (6 << 20);
+        struct portal_ioctl_data data, *newdata;
+        char *databuf = NULL;
+
+        if (argc > 3) {
+                fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc > 1) {
+                fd = fopen(argv[1], "w");
+                if (fd == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                                strerror(errno));
+                        return -1;
+                }
+        }
+        if (argc > 2)
+                raw = atoi(argv[2]);
+
+        databuf = malloc(databuf_size);
+        if (!databuf) {
+                fprintf(stderr, "No memory for buffer.\n");
+                goto out;
+        }
+
+        /* the kernel fills databuf through the pbuf1/plen1 pair */
+        memset(&data, 0, sizeof(data));
+        data.ioc_plen1 = databuf_size;
+        data.ioc_pbuf1 = databuf;
+
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                goto out;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n",
+                        strerror(errno));
+                goto out;
+        }
+
+        /* ioc_size of the returned header is the amount of log data */
+        newdata = (struct portal_ioctl_data *)buf;
+        if (newdata->ioc_size > 0)
+                dump_buffer(fd, databuf, newdata->ioc_size, raw);
+        else
+                fprintf(stderr, "No data in the debug buffer.\n");
+        result = 0;
+
+ out:
+        if (databuf)
+                free(databuf);
+        if (fd != stdout)
+                fclose(fd);
+        return result;
+}
+
+/*
+ * "debug_daemon" command: control the kernel debug daemon through the
+ * IOC_PORTAL_SET_DAEMON ioctl.
+ *
+ *   argv[1]  start | stop | pause | continue (case-insensitive)
+ *   argv[2]  for "start": the file the daemon should write to; it must
+ *            either be creatable or already writable
+ *   argv[3]  for "start": optional size, passed to the kernel in ioc_misc
+ *            (0 when omitted)
+ *
+ * Returns 0 on success or after printing the usage message, -1 on a bad
+ * file or size argument or a packing failure, and the ioctl's negative
+ * result if the ioctl fails.
+ */
+int jt_dbg_debug_daemon(int argc, char **argv)
+{
+        int i, rc;
+        unsigned int cmd = 0;
+        FILE *fd = stdout;
+        struct portal_ioctl_data data;
+
+        if (argc <= 1) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        for (i = 0; portal_debug_daemon_cmd[i].cmd != NULL; i++) {
+                if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) {
+                        cmd = portal_debug_daemon_cmd[i].cmdv;
+                        break;
+                }
+        }
+        /* the loop ran off the end of the table: unknown sub-command */
+        if (portal_debug_daemon_cmd[i].cmd == NULL) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        memset(&data, 0, sizeof(data));
+        if (cmd == DEBUG_DAEMON_START) {
+                if (argc < 3) {
+                        fprintf(stderr, "usage: %s [start file <#MB>|stop|"
+                                "pause|continue]\n", argv[0]);
+                        return 0;
+                }
+                /* If the file does not exist yet, probe that it can be
+                 * created, then remove the probe file again. */
+                if (access(argv[2], F_OK) != 0) {
+                        fd = fopen(argv[2], "w");
+                        if (fd != NULL) {
+                                fclose(fd);
+                                remove(argv[2]);
+                                goto ok;
+                        }
+                }
+                if (access(argv[2], W_OK) == 0)
+                        goto ok;
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                        strerror(errno));
+                return -1;
+ok:
+                data.ioc_inllen1 = strlen(argv[2]) + 1;
+                data.ioc_inlbuf1 = argv[2];
+                data.ioc_misc = 0;
+                if (argc == 4) {
+                        unsigned long size;
+                        char *end;
+
+                        errno = 0;
+                        size = strtoul(argv[3], &end, 0);
+                        if (errno) {
+                                fprintf(stderr, "file size(%s): error %s\n",
+                                        argv[3], strerror(errno));
+                                return -1;
+                        }
+                        /* strtoul() does not set errno when no digits
+                         * are found, so check the end pointer as well */
+                        if (end == argv[3] || *end != '\0') {
+                                fprintf(stderr, "file size(%s): not a number\n",
+                                        argv[3]);
+                                return -1;
+                        }
+                        data.ioc_misc = size;
+                }
+        }
+        data.ioc_count = cmd;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf);
+        if (rc < 0) {
+                fprintf(stderr, "IOC_PORTAL_SET_DAEMON failed: %s\n",
+                                strerror(errno));
+                return rc;
+        }
+        return 0;
+}
+
+/*
+ * "debug_file" command: read a saved debug log from a file and print it
+ * through dump_buffer().
+ *
+ *   argv[1]  input file
+ *   argv[2]  optional output file (default: stdout)
+ *   argv[3]  optional 'raw' flag, parsed with atoi() (default: 1)
+ *
+ * The input is mapped privately and writable because dump_buffer()
+ * temporarily modifies the buffer it is given.
+ *
+ * Returns 0 on success or after printing the usage message, -1 if the
+ * input cannot be opened, stat'ed or mapped, or the output cannot be
+ * opened.
+ */
+int jt_dbg_debug_file(int argc, char **argv)
+{
+        int rc, fd = -1, raw = 1;
+        int result = -1;        /* set to 0 only once the log was dumped */
+        FILE *output = stdout;
+        char *databuf = NULL;
+        struct stat statbuf;
+
+        if (argc > 4 || argc < 2) {
+                fprintf(stderr, "usage: %s <input> [output] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        fd = open(argv[1], O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                        strerror(errno));
+                return -1;
+        }
+        /* NOTE(review): the raw syscall fills a 'struct stat'; confirm the
+         * layout matches what SYS_fstat64 writes on the target platform. */
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+        rc = syscall(__SYS_fstat__, fd, &statbuf);
+        if (rc < 0) {
+                fprintf(stderr, "fstat failed: %s\n", strerror(errno));
+                goto out;
+        }
+
+        if (argc >= 3) {
+                output = fopen(argv[2], "w");
+                if (output == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                                strerror(errno));
+                        goto out;
+                }
+        }
+
+        if (argc == 4)
+                raw = atoi(argv[3]);
+
+        databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+        /* mmap() reports failure with MAP_FAILED, not NULL */
+        if (databuf == MAP_FAILED) {
+                fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+                databuf = NULL;         /* nothing to unmap below */
+                goto out;
+        }
+
+        dump_buffer(output, databuf, statbuf.st_size, raw);
+        result = 0;
+
+ out:
+        if (databuf)
+                munmap(databuf, statbuf.st_size);
+        if (output != stdout)
+                fclose(output);
+        if (fd >= 0)
+                close(fd);
+        return result;
+}
+
+/*
+ * "clear" command: issue the IOC_PORTAL_CLEAR_DEBUG ioctl with an
+ * all-zero request.  Takes no arguments.
+ *
+ * Returns 0 on success or after printing the usage message, -1 if the
+ * request cannot be packed or the ioctl fails.
+ */
+int jt_dbg_clear_debug_buf(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        int rc;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        rc = portal_ioctl_pack(&data, &buf, max);
+        if (rc != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * "mark" command: send a marker string to the kernel with the
+ * IOC_PORTAL_MARK_DEBUG ioctl.  The marker is argv[1] when given,
+ * otherwise the current time as formatted by ctime() with its trailing
+ * newline removed.
+ *
+ * Returns 0 on success or after printing the usage message, -1 if the
+ * request cannot be packed or the ioctl fails.
+ */
+int jt_dbg_mark_debug_buf(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        time_t now = time(NULL);
+        char *marker;
+        int rc;
+
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [marker text]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc != 2) {
+                marker = ctime(&now);
+                marker[strlen(marker) - 1] = '\0'; /* stupid \n */
+        } else {
+                marker = argv[1];
+        }
+
+        /* the marker travels as inline buffer 1, NUL included */
+        memset(&data, 0, sizeof(data));
+        data.ioc_inlbuf1 = marker;
+        data.ioc_inllen1 = strlen(marker) + 1;
+
+        rc = portal_ioctl_pack(&data, &buf, max);
+        if (rc != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * "modules" command: for every module in a fixed table that
+ * query_module() can find, print an "add-symbol-file" line giving the
+ * object file path under 'path' (argv[1], default "..") and an address
+ * computed as info.addr + sizeof(struct module).
+ *
+ * argv[2] is accepted as a kernel name but is not used by the code below.
+ * Modules for which query_module() fails with ENOENT are skipped
+ * silently; other failures are reported.  Only implemented for kernels
+ * older than 2.5.  Always returns 0.
+ */
+int jt_dbg_modules(int argc, char **argv)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        struct mod_paths {
+                char *name, *path;
+        } *entry, table[] = {
+                {"portals", "portals/linux/oslib"},
+                {"ksocknal", "portals/linux/socknal"},
+                {"obdclass", "lustre/obdclass"},
+                {"ptlrpc", "lustre/ptlrpc"},
+                {"obdext2", "lustre/obdext2"},
+                {"ost", "lustre/ost"},
+                {"osc", "lustre/osc"},
+                {"mds", "lustre/mds"},
+                {"mdc", "lustre/mdc"},
+                {"llite", "lustre/llite"},
+                {"obdecho", "lustre/obdecho"},
+                {"ldlm", "lustre/ldlm"},
+                {"obdfilter", "lustre/obdfilter"},
+                {"extN", "lustre/extN"},
+                {"lov", "lustre/lov"},
+                {"fsfilt_ext3", "lustre/obdclass"},
+                {"fsfilt_extN", "lustre/obdclass"},
+                {"mds_ext2", "lustre/mds"},
+                {"mds_ext3", "lustre/mds"},
+                {"mds_extN", "lustre/mds"},
+                {"ptlbd", "lustre/ptlbd"},
+                {NULL, NULL}
+        };
+        /* declared locally, as no header in scope provides a prototype */
+        int query_module(const char *name, int which, void *buf,
+                         size_t bufsize, size_t *ret);
+        char *path = "..";
+        char *kernel = "linux";
+
+        if (argc > 3) {
+                printf("%s [path] [kernel]\n", argv[0]);
+                return 0;
+        }
+        if (argc >= 2)
+                path = argv[1];
+        if (argc == 3)
+                kernel = argv[2];
+
+        for (entry = table; entry->name != NULL; entry++) {
+                struct module_info info;
+                size_t ret_len;
+
+                if (query_module(entry->name, QM_INFO, &info, sizeof(info),
+                                 &ret_len) >= 0) {
+                        printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path,
+                               entry->path, entry->name,
+                               info.addr + sizeof(struct module));
+                        continue;
+                }
+
+                /* a module that is simply not loaded is not an error */
+                if (errno != ENOENT)
+                        printf("query_module(%s) failed: %s\n",
+                               entry->name, strerror(errno));
+        }
+
+        return 0;
+#else
+        printf("jt_dbg_module is not yet implemented for Linux 2.5\n");
+        return 0;
+#endif /* linux 2.5 */
+}
+
+/*
+ * "panic" command: issue the IOC_PORTAL_PANIC ioctl with an all-zero
+ * request.  Takes no arguments.
+ *
+ * Returns 0 on success or after printing the usage message, -1 if the
+ * request cannot be packed or the ioctl fails.
+ */
+int jt_dbg_panic(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        int rc;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        rc = portal_ioctl_pack(&data, &buf, max);
+        if (rc != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf) != 0) {
+                fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
diff --git a/lustre/portals/utils/debugctl.c b/lustre/portals/utils/debugctl.c
new file mode 100644 (file)
index 0000000..02cb9b4
--- /dev/null
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+
+/* Command table handed to the parser by main(): each entry gives the
+ * command name, its handler, a third field that is 0 for every entry, and
+ * the help text.  The table ends with an all-zero entry. */
+command_t list[] = {
+        {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"},
+        {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, 
+        {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file <input> [output] [raw], read debug buffer from input and print it [to output]"},
+        {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"},
+        {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"},
+        {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"},
+        {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"},
+        {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"},
+        {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: <path>)"},
+        {"panic", jt_dbg_panic, 0, "cause the kernel to panic"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
+/*
+ * debugctl entry point.
+ *
+ * Initialises the debug filter state, registers the portals ioctl device
+ * and sets up the parser.  With command-line arguments the single command
+ * they spell is executed and its result becomes the exit status; without
+ * arguments the interactive command loop runs and the exit status is 0.
+ * Exits with status 2 if dbg_initialize() fails.
+ */
+int main(int argc, char **argv)
+{
+        int rc = 0;
+
+        if (dbg_initialize(argc, argv) < 0)
+                exit(2);
+
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+
+        Parser_init("debugctl > ", list);
+        if (argc > 1)
+                rc = Parser_execarg(argc - 1, &argv[1], list);
+        else
+                Parser_commands();
+
+        /* unregister on both paths; the one-shot path used to return
+         * without doing so */
+        unregister_ioc_dev(PORTALS_DEV_ID);
+        return rc;
+}
diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c
new file mode 100644 (file)
index 0000000..722bb57
--- /dev/null
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+/* One registered ioctl target: the path of its device node and the file
+ * descriptor for it (-1 while the node has not been opened yet). */
+struct ioc_dev {
+       const char * dev_name;
+       int dev_fd;
+};
+
+/* Registered devices, indexed by device id. */
+static struct ioc_dev ioc_dev_list[10];
+
+/* Header written in front of every ioctl buffer in a dump file. */
+struct dump_hdr {
+       int magic;
+       int dev_id;
+       int opc;
+};
+
+/* When non-NULL, l_ioctl() appends ioctl buffers to this file instead of
+ * issuing them. */
+char * dump_filename;
+
+/* Return an open file descriptor for registered device dev_id, opening the
+ * device node on first use and caching the descriptor for later calls.
+ * Returns -EINVAL for an out-of-range or unregistered id, or the negative
+ * result of open() when the node cannot be opened. */
+static int
+open_ioc_dev(int dev_id)
+{
+        const char *dev_name;
+        int fd;
+
+        /* Bound by the element count: sizeof(ioc_dev_list) alone is a byte
+         * count and would let ids past the end of the table through. */
+        if (dev_id < 0 ||
+            dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])))
+                return -EINVAL;
+
+        dev_name = ioc_dev_list[dev_id].dev_name;
+        if (dev_name == NULL) {
+                fprintf(stderr, "unknown device id: %d\n", dev_id);
+                return -EINVAL;
+        }
+
+        /* already opened by an earlier call */
+        if (ioc_dev_list[dev_id].dev_fd >= 0)
+                return ioc_dev_list[dev_id].dev_fd;
+
+        fd = open(dev_name, O_RDWR);
+        if (fd < 0) {
+                fprintf(stderr, "opening %s failed: %s\n"
+                        "hint: the kernel modules may not be loaded\n",
+                        dev_name, strerror(errno));
+                return fd;
+        }
+
+        ioc_dev_list[dev_id].dev_fd = fd;
+        return fd;
+}
+
+
+/* Issue ioctl opc with argument buf on device dev_id, opening the device
+ * first if needed.  Returns the ioctl() result, or the negative value from
+ * open_ioc_dev() when the device is unavailable. */
+static int
+do_ioctl(int dev_id, int opc, void *buf)
+{
+        int dev_fd = open_ioc_dev(dev_id);
+
+        if (dev_fd >= 0)
+                return ioctl(dev_fd, opc, buf);
+
+        return dev_fd;
+}
+
+/* Open the configured dump file for appending.  Returns NULL (after
+ * printing a message) when no dump file is set, or NULL from fopen() when
+ * the file cannot be opened. */
+static FILE *
+get_dump_file()
+{
+        if (dump_filename == NULL) {
+                fprintf(stderr, "no dump filename\n");
+                return NULL;
+        }
+
+        return fopen(dump_filename, "a");
+}
+
+/*
+ * Append one ioctl to the dump file: a struct dump_hdr followed by the
+ * ioctl buffer itself (ioc_len bytes, taken from the buffer's leading
+ * struct portal_ioctl_hdr).  Returns 0 on success and -EINVAL when the
+ * file cannot be opened or written.
+ *
+ * The dump file should start with a description of which devices are
+ * used, but for now it is assumed that whatever app reads the file will
+ * know what to do. */
+int
+dump(int dev_id, int opc, void *buf)
+{
+        struct portal_ioctl_hdr *ioc_hdr = (struct portal_ioctl_hdr *) buf;
+        struct dump_hdr dump_hdr;
+        int saved_errno = 0;
+        int written;
+        FILE *fp;
+
+        fp = get_dump_file();
+        if (fp == NULL) {
+                /* get_dump_file() has already reported a missing filename;
+                 * only an fopen() failure has an errno worth printing. */
+                if (dump_filename != NULL)
+                        fprintf(stderr, "%s: %s\n", dump_filename,
+                                strerror(errno));
+                return -EINVAL;
+        }
+
+        printf("dumping opc %x to %s\n", opc, dump_filename);
+
+        dump_hdr.magic = 0xdeadbeef;
+        dump_hdr.dev_id = dev_id;
+        dump_hdr.opc = opc;
+
+        written = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp) == 1 &&
+                  fwrite(buf, ioc_hdr->ioc_len, 1, fp) == 1;
+        if (!written)
+                saved_errno = errno;    /* fclose() below may overwrite it */
+
+        /* fclose() flushes the buffered data, so its failure is a write
+         * failure as well. */
+        if (fclose(fp) != 0 && written) {
+                saved_errno = errno;
+                written = 0;
+        }
+
+        if (!written) {
+                fprintf(stderr, "%s: %s\n", dump_filename,
+                        strerror(saved_errno));
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+/* Register a device to send ioctls to.  Any device previously registered
+ * under dev_id is unregistered first.  dev_name is stored by pointer, not
+ * copied.  Returns dev_id, or -EINVAL when dev_id is outside the table. */
+int
+register_ioc_dev(int dev_id, const char * dev_name)
+{
+        /* compare against the element count, not the table's byte size */
+        if (dev_id < 0 ||
+            dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])))
+                return -EINVAL;
+
+        unregister_ioc_dev(dev_id);
+
+        ioc_dev_list[dev_id].dev_name = dev_name;
+        ioc_dev_list[dev_id].dev_fd = -1;       /* opened lazily */
+
+        return dev_id;
+}
+
+/* Forget the device registered under dev_id, closing its descriptor if it
+ * was opened.  Out-of-range ids are ignored. */
+void
+unregister_ioc_dev(int dev_id)
+{
+        /* compare against the element count, not the table's byte size */
+        if (dev_id < 0 ||
+            dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])))
+                return;
+
+        /* A never-registered slot is zero-filled, so its dev_fd of 0 must
+         * not be closed; only trust dev_fd when a name is present. */
+        if (ioc_dev_list[dev_id].dev_name != NULL &&
+            ioc_dev_list[dev_id].dev_fd >= 0)
+                close(ioc_dev_list[dev_id].dev_fd);
+
+        ioc_dev_list[dev_id].dev_name = NULL;
+        ioc_dev_list[dev_id].dev_fd = -1;
+}
+
+/* If this file is set, then all ioctl buffers will be appended to the
+ * file.  The name is copied.  Passing NULL clears the dump file again.
+ * Returns 0 on success, or -ENOMEM (leaving the previous setting in
+ * place) when the name cannot be copied. */
+int
+set_ioctl_dump(char * file)
+{
+        char *copy = NULL;
+
+        if (file != NULL) {
+                copy = strdup(file);
+                if (copy == NULL)
+                        return -ENOMEM;
+        }
+
+        free(dump_filename);
+        dump_filename = copy;
+        return 0;
+}
+
+/* Front end for all ioctls: record the request in the dump file when one
+ * is configured, otherwise issue it on the device. */
+int
+l_ioctl(int dev_id, int opc, void *buf)
+{
+        if (dump_filename == NULL)
+                return do_ioctl(dev_id, opc, buf);
+
+        return dump(dev_id, opc, buf);
+}
+
+/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
+ * in the file.  For example:
+ *
+ * parse_dump("lctl.dump", l_ioctl);
+ *
+ * Note: if using l_ioctl, then you also need to register_ioc_dev() for
+ * each device used in the dump.
+ *
+ * Each record is a struct dump_hdr followed by an ioctl buffer whose
+ * leading struct portal_ioctl_hdr holds the buffer length.  Returns 0 once
+ * every record has been replayed and -1 when the file cannot be mapped or
+ * a record is malformed.  Exits with status 1 when the file cannot be
+ * opened or stat'ed, is empty, or ioc_func reports a failure.
+ */
+int
+parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
+{
+        struct stat st;
+        char *map, *buf, *end;
+        int fd, line = 0;
+        int result = -1;
+
+        fd = open(dump_file, O_RDONLY);
+        if (fd < 0) {
+                perror(dump_file);
+                exit(1);
+        }
+
+        /* Go through the libc wrapper so that the structure filled in is
+         * the struct stat this file was compiled against. */
+        if (fstat(fd, &st)) {
+                perror("stat fails");
+                exit(1);
+        }
+
+        if (st.st_size < 1) {
+                fprintf(stderr, "KML is empty\n");
+                exit(1);
+        }
+
+        map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+        if (map == MAP_FAILED) {
+                perror("mmap fails");
+                close(fd);
+                return -1;
+        }
+        close(fd);              /* the mapping stays valid without the fd */
+
+        buf = map;
+        end = map + st.st_size;
+        while (buf < end) {
+                struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+                struct portal_ioctl_hdr * data;
+                char tmp[8096];
+                size_t left = (size_t)(end - buf);
+                size_t len;
+                int rc;
+
+                line++;
+
+                /* Both headers must lie inside the mapping before ioc_len
+                 * can be read, let alone trusted. */
+                if (left < sizeof(*dump_hdr) + sizeof(*data)) {
+                        fprintf(stderr, "dump file truncated in record %d\n",
+                                line);
+                        goto out;
+                }
+
+                data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+                len = data->ioc_len;
+                if (len < sizeof(*data) || len > left - sizeof(*dump_hdr)) {
+                        fprintf(stderr, "dump file overflow, %p + %d > %p\n",
+                                (void *)buf, (int)data->ioc_len, (void *)end);
+                        goto out;
+                }
+
+                /* the buffer is replayed from a writable stack copy */
+                if (len > sizeof(tmp)) {
+                        fprintf(stderr,
+                                "record %d: ioctl buffer of %lu bytes exceeds %lu\n",
+                                line, (unsigned long)len,
+                                (unsigned long)sizeof(tmp));
+                        goto out;
+                }
+                memcpy(tmp, data, len);
+
+                rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+                if (rc) {
+                        printf("failed: %d\n", rc);
+                        exit(1);
+                }
+
+                buf += len + sizeof(*dump_hdr);
+        }
+        result = 0;
+out:
+        munmap(map, st.st_size);
+        return result;
+}
+
+/* "dump" command: from now on append every ioctl buffer to the file named
+ * by argv[1] instead of issuing it.  Always returns 0. */
+int
+jt_ioc_dump(int argc, char **argv)
+{
+        /* exactly one argument: without it argv[1] is NULL */
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s file\n", argv[0]);
+                return 0;
+        }
+        printf("setting dumpfile to: %s\n", argv[1]);
+
+        set_ioctl_dump(argv[1]);
+        return 0;
+}
diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c
new file mode 100644 (file)
index 0000000..4d93645
--- /dev/null
@@ -0,0 +1,703 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <assert.h>
+
+#include <config.h>
+#ifdef HAVE_LIBREADLINE
+#define        READLINE_LIBRARY
+#include <readline/readline.h>
+#endif
+//extern char **completion_matches __P((char *, rl_compentry_func_t *));
+extern void using_history(void);
+extern void stifle_history(int);
+extern void add_history(char *);
+
+#include "parser.h"
+
+static command_t * top_level;      /* Top level of commands, initialized by
+                                    * Parser_init                           */
+static char * parser_prompt = NULL;/* Parser prompt, set by Parser_init     */
+static int done;                  /* Set to 1 by Parser_quit/Parser_exit   */
+
+
+/* static functions */
+static char *skipwhitespace(char *s);
+static char *skiptowhitespace(char *s);
+static command_t *find_cmd(char *name, command_t cmds[], char **next);
+static int process(char *s, char **next, command_t *lookup, command_t **result,
+                   char **prev);
+static void print_commands(char *str, command_t *table);
+
+/* Return a pointer to the first non-whitespace character of s, or to its
+ * terminating NUL when s is empty or consists of whitespace only. */
+static char * skipwhitespace(char * s)
+{
+        /* <ctype.h> functions need an unsigned char value; passing a
+         * negative plain char is undefined behaviour.  The NUL terminator
+         * is not whitespace, so the scan cannot run off the string. */
+        while (isspace((unsigned char)*s))
+                s++;
+        return s;
+}
+
+
+/* Return a pointer to the first whitespace character of s, or to its
+ * terminating NUL when s contains no whitespace. */
+static char * skiptowhitespace(char * s)
+{
+        /* cast: <ctype.h> functions are undefined for negative char values */
+        while (*s != '\0' && !isspace((unsigned char)*s))
+                s++;
+        return s;
+}
+
+/* Split line in place into blank/tab separated words, storing at most
+ * maxargs pointers in argv.  Returns the number of words stored; further
+ * words are ignored.  line is modified (separators become NULs). */
+static int line2args(char *line, char **argv, int maxargs)
+{
+        char *arg;
+        int i = 0;
+
+        /* i < maxargs, not <=: argv has room for exactly maxargs entries */
+        for (arg = strtok(line, " \t");
+             arg != NULL && i < maxargs;
+             arg = strtok(NULL, " \t"))
+                argv[i++] = arg;
+
+        return i;
+}
+
+/* look a command up by its exact (case-sensitive) name; returns its table
+   entry, or NULL when no entry has that name */
+static command_t *Parser_findargcmd(char *name, command_t cmds[])
+{
+        command_t *entry = cmds;
+
+        while (entry->pc_name != NULL) {
+                if (!strcmp(name, entry->pc_name))
+                        return entry;
+                entry++;
+        }
+
+        return NULL;
+}
+
+/* Run the command named by argv[0] with the given argument vector and
+   return its result.  When no such command exists, list the valid names
+   and return -1. */
+int Parser_execarg(int argc, char **argv, command_t cmds[])
+{
+        command_t *cmd = Parser_findargcmd(argv[0], cmds);
+
+        if (cmd != NULL)
+                return (cmd->pc_func)(argc, argv);
+
+        printf("Try interactive use without arguments or use one of:\n");
+        for (cmd = cmds; cmd->pc_name != NULL; cmd++)
+                printf("\"%s\" ", cmd->pc_name);
+        printf("\nas argument.\n");
+
+        return -1;
+}
+
+/* Returns the first entry of cmds whose name starts (case-insensitively)
+   with the first token of name, or NULL when there is none, the token is
+   empty, or either argument is NULL.  On a match *next points to the
+   following token; on a failed lookup it is left at the end of the first
+   token.  *next is not written when an argument is NULL.  The string
+   itself is never modified. */
+static command_t * find_cmd(char * name, command_t cmds[], char ** next)
+{
+        command_t *entry;
+        int len;
+
+        if (cmds == NULL || name == NULL)
+                return NULL;
+
+        /* isolate the first token: [name, *next) */
+        name = skipwhitespace(name);
+        *next = skiptowhitespace(name);
+        len = *next - name;
+        if (len == 0)
+                return NULL;
+
+        for (entry = cmds; entry->pc_name != NULL; entry++) {
+                if (strncasecmp(name, entry->pc_name, len) != 0)
+                        continue;
+
+                *next = skipwhitespace(*next);
+                return entry;
+        }
+
+        return NULL;
+}
+
+/* Recursively process a command line string s and find the command
+   corresponding to it.  Returns CMD_COMPLETE (an entry with a handler was
+   found), CMD_INCOMPLETE (the line stops at an entry that only has
+   sub-commands), CMD_AMBIG (an abbreviation matches several entries) or
+   CMD_NONE (nothing matches).  *result is the entry found (NULL for
+   CMD_NONE), *prev the start of the token that was looked up last, and
+   *next the text following the matched token. */
+static int process(char *s, char ** next, command_t *lookup,
+                   command_t **result, char **prev)
+{
+        char *probe;
+
+        *result = find_cmd(s, lookup, next);
+        *prev = s;
+
+        /* non existent */
+        if (*result == NULL)
+                return CMD_NONE;
+
+        /* found entry: is it ambiguous, i.e. not an exact command name and
+           more than one command in the list matches?  The probe uses its
+           own cursor so that *next keeps pointing at the following token. */
+        if (strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) &&
+            find_cmd(s, (*result) + 1, &probe))
+                return CMD_AMBIG;
+
+        /* found a unique command: component or full? */
+        if ((*result)->pc_func)
+                return CMD_COMPLETE;
+
+        /* an entry with neither handler nor sub-commands leads nowhere */
+        if ((*result)->pc_sub_cmd == NULL) {
+                *result = NULL;
+                return CMD_NONE;
+        }
+
+        /* test the character, not the pointer: *next itself is never NULL */
+        if (**next == '\0')
+                return CMD_INCOMPLETE;
+
+        return process(*next, next, (*result)->pc_sub_cmd, result, prev);
+}
+
+#ifdef HAVE_LIBREADLINE
+static command_t * match_tbl;   /* Command completion against this table */
+
+/* Completion generator: each call returns a malloc'ed copy of the next
+ * name in match_tbl that starts (case-insensitively) with text, or NULL
+ * when there are no more.  state == 0 restarts the scan. */
+static char * command_generator(const char * text, int state)
+{
+        static int index, len;
+        char *name;
+
+        /* nothing to complete against */
+        if (match_tbl == NULL)
+                return NULL;
+
+        /* first call for this word: rewind and remember the prefix length */
+        if (state == 0) {
+                index = 0;
+                len = (int)strlen(text);
+        }
+
+        for (;;) {
+                name = match_tbl[index].pc_name;
+                if (name == NULL)
+                        break;
+
+                index++;
+                if (!strncasecmp(name, text, len))
+                        return strdup(name);
+        }
+
+        /* No more matches */
+        return NULL;
+}
+
+/* Completion hook; init_input() installs it as
+ * rl_attempted_completion_function.  It selects the command table that
+ * matches the words already present in rl_line_buffer and then lets
+ * command_generator() produce the candidates for text.  start and end are
+ * not used. */
+static char **command_completion(char * text, int start, int end)
+{
+    command_t  * table;
+    char       * pos;
+
+    match_tbl = top_level;
+    /* Walk the line one matched word at a time; pos is advanced past each
+     * matched word by find_cmd(). */
+    for (table = find_cmd(rl_line_buffer, match_tbl, &pos);
+        table;
+        table = find_cmd(pos, match_tbl, &pos)) {
+
+       /* a blank right before pos means the word is finished, so
+        * completion moves on to that command's sub-command table */
+       if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd;
+    }
+
+    return(completion_matches(text, command_generator));
+}
+#endif
+
+/* Look up the command at the start of line and run it.  Returns the
+   handler's result, or 0 when no handler was run (unknown, ambiguous or
+   incomplete command; a diagnostic is printed instead).  line is split in
+   place when a handler is run. */
+int execute_line(char * line)
+{
+        command_t *cmd, *ambig;
+        char *argv[MAXARGS];
+        char *prev, *next, *tmp;
+        int status, i;
+        int rc = 0;
+
+        status = process(line, &next, top_level, &cmd, &prev);
+
+        if (status == CMD_COMPLETE) {
+                i = line2args(line, argv, MAXARGS);
+                rc = (cmd->pc_func)(i, argv);
+
+                /* the handler asked for its usage text */
+                if (rc == CMD_HELP)
+                        fprintf(stderr, "%s\n", cmd->pc_help);
+        } else if (status == CMD_INCOMPLETE) {
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; cmd->pc_sub_cmd[i].pc_name != NULL; i++)
+                        fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name);
+                fprintf(stderr, "\n");
+        } else if (status == CMD_NONE) {
+                fprintf(stderr, "No such command, type help\n");
+        } else if (status == CMD_AMBIG) {
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                /* cmd is the first candidate; keep searching after each hit */
+                for (ambig = find_cmd(prev, cmd, &tmp);
+                     ambig != NULL;
+                     ambig = find_cmd(prev, ambig + 1, &tmp))
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                fprintf(stderr, "\n");
+        }
+
+        return rc;
+}
+
+/* does nothing; init_input() installs it in place of readline's terminal
+   prep/deprep hooks when stdin is not a terminal */
+int
+noop_fn ()
+{
+        return 0;
+}
+
+/* Prepare line input and report whether stdin is a terminal (non-zero) or
+   not (0).  With readline available this also sets up history and command
+   completion, and disables terminal preparation for non-terminal input.
+   Without it (say you're in an airplane and forgot to install
+   readline-dev) only the terminal check is done. */
+int init_input()
+{
+        const int interactive = isatty(fileno(stdin));
+
+#ifdef HAVE_LIBREADLINE
+        using_history();
+        stifle_history(HISTORY);
+
+        if (interactive == 0) {
+                rl_prep_term_function = (rl_vintfunc_t *)noop_fn;
+                rl_deprep_term_function = (rl_voidfunc_t *)noop_fn;
+        }
+
+        rl_attempted_completion_function = (CPPFunction *)command_completion;
+        rl_completion_entry_function = (void *)command_generator;
+#endif
+        return interactive;
+}
+
+#ifndef HAVE_LIBREADLINE
+#define add_history(s)
+/* Stand-in for readline(): print prompt (if any), read one line of at most
+   2047 characters from stdin and return a malloc'ed copy without the
+   trailing newline.  Returns NULL at end of input. */
+char * readline(char * prompt)
+{
+        char line[2048];
+        size_t n;
+
+        if (prompt != NULL)
+                printf ("%s", prompt);
+
+        if (!fgets(line, sizeof(line), stdin))
+                return (NULL);
+
+        n = strlen(line);
+        if (n > 0 && line[n - 1] == '\n')
+                line[n - 1] = '\0';
+
+        return strdup(line);
+}
+#endif
+
+/* The interactive loop: read lines and execute them until end of input or
+   until a command sets done.  Returns the result of the last command
+   executed (0 if none was).  The prompt is only shown on a terminal. */
+int Parser_commands(void)
+{
+        int interactive = init_input();
+        int rc = 0;
+
+        while (!done) {
+                char *line = readline(interactive ? parser_prompt : NULL);
+                char *cmd;
+
+                if (line == NULL)
+                        break;
+
+                /* blank lines are neither recorded nor executed */
+                cmd = skipwhitespace(line);
+                if (*cmd != '\0') {
+                        add_history(cmd);
+                        rc = execute_line(cmd);
+                }
+
+                free(line);
+        }
+
+        return rc;
+}
+
+
+/* (re)initialise the parser: install the command table, take a private
+   copy of the prompt and clear the quit flag */
+void Parser_init(char * prompt, command_t * cmds)
+{
+        /* free(NULL) is a no-op, so the first call needs no special case */
+        free(parser_prompt);
+        parser_prompt = strdup(prompt);
+
+        top_level = cmds;
+        done = 0;
+}
+
+/* shut the parser down: release the prompt and make Parser_commands()
+   stop; the arguments are not used */
+void Parser_exit(int argc, char *argv[])
+{
+        free(parser_prompt);
+        parser_prompt = NULL;
+        done = 1;
+}
+
+/* Convert a string to an integer: decimal by default, octal with a leading
+   "0", hexadecimal with a leading "0x" or "0X".  Returns what sscanf()
+   returns: 1 when a number was read into *val, 0 or EOF otherwise (*val
+   is then left alone).  Trailing characters are ignored. */
+int Parser_int(char *s, int *val)
+{
+        unsigned int uval;
+        int ret;
+
+        if (s[0] != '0')
+                return sscanf(s, "%d", val);
+
+        /* %o and %x store an unsigned int, so read into one and convert */
+        if (s[1] == 'x' || s[1] == 'X')
+                ret = sscanf(s + 2, "%x", &uval);
+        else
+                ret = sscanf(s, "%o", &uval);
+
+        if (ret == 1)
+                *val = (int)uval;
+
+        return ret;
+}
+
+
+/* quick help: list every command of the top-level table; the arguments
+   are not used */
+void Parser_qhelp(int argc, char *argv[])
+{
+        fputs("Available commands are:\n", stdout);
+        print_commands(NULL, top_level);
+        fputs("For more help type: help command-name\n", stdout);
+}
+
+/* "help" command.  Without arguments, lists all commands.  Otherwise the
+   arguments name a command (one word per level) and its help text, its
+   sub-commands, or the candidates of an ambiguous abbreviation are
+   printed.  Always returns 0. */
+int Parser_help(int argc, char **argv)
+{
+        char line[1024];
+        char *next, *prev, *tmp;
+        command_t *result, *ambig;
+        size_t used = 0;
+        int i;
+
+        if ( argc == 1 ) {
+                Parser_qhelp(argc, argv);
+                return 0;
+        }
+
+        /* Rebuild the command path from the arguments: words are joined
+           with single blanks so that sub-commands stay separate tokens,
+           and the copy is bounded by the size of line. */
+        line[0] = '\0';
+        for (i = 1; i < argc; i++) {
+                int n = snprintf(line + used, sizeof(line) - used, "%s%s",
+                                 i > 1 ? " " : "", argv[i]);
+
+                if (n < 0 || (size_t)n >= sizeof(line) - used) {
+                        fprintf(stderr, "help: command name too long\n");
+                        return 0;
+                }
+                used += (size_t)n;
+        }
+
+        switch ( process(line, &next, top_level, &result, &prev) ) {
+        case CMD_COMPLETE:
+                fprintf(stderr, "%s: %s\n",line, result->pc_help);
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "%s: Unknown command.\n", line);
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; result->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                /* result is the first candidate; list it and the rest */
+                while( (ambig = find_cmd(prev, result, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        result = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        }
+        return 0;
+}
+
+
+/* print the help of one command, as if "help cmd" had been typed */
+void Parser_printhelp(char *cmd)
+{
+        char *help_argv[2];
+
+        help_argv[0] = "help";
+        help_argv[1] = cmd;
+        Parser_help(2, help_argv);
+}
+
+/*************************************************************************
+ * COMMANDS                                                             *
+ *************************************************************************/
+
+
+/* Print every runnable command of table, one per line, each prefixed with
+   str (the path of parent commands; NULL at the top level), descending
+   into sub-command tables.  A path longer than the local buffer is
+   truncated rather than overflowing it. */
+static void print_commands(char * str, command_t * table) {
+        command_t *cmds;
+        char buf[80];
+
+        for (cmds = table; cmds->pc_name; cmds++) {
+                if (cmds->pc_func) {
+                        if (str)
+                                printf("\t%s %s\n", str, cmds->pc_name);
+                        else
+                                printf("\t%s\n", cmds->pc_name);
+                }
+
+                if (cmds->pc_sub_cmd) {
+                        if (str) {
+                                /* bounded: nested names can exceed buf */
+                                snprintf(buf, sizeof(buf), "%s %s", str,
+                                         cmds->pc_name);
+                                print_commands(buf, cmds->pc_sub_cmd);
+                        } else {
+                                print_commands(cmds->pc_name, cmds->pc_sub_cmd);
+                        }
+                }
+        }
+}
+
+/* Prompt with "prompt [deft]: " and copy the answer into res, or deft when
+   the answer is empty or input has ended.  At most len - 1 characters are
+   copied and res is always NUL-terminated (for len > 0).  Returns res, or
+   NULL at end of input (res then holds the default). */
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+                    size_t len)
+{
+        size_t size = strlen(prompt) + strlen(deft) + 8;
+        char *theprompt = malloc(size);
+        const char *src;
+        char *line;
+
+        assert(theprompt);
+        snprintf(theprompt, size, "%s [%s]: ", prompt, deft);
+
+        line = readline(theprompt);
+        free(theprompt);
+
+        src = (line == NULL || *line == '\0') ? deft : line;
+        if (len > 0) {
+                /* strncpy() alone leaves res unterminated when src fills it */
+                strncpy(res, src, len - 1);
+                res[len - 1] = '\0';
+        }
+
+        if (line == NULL)
+                return NULL;
+
+        free(line);
+        return res;
+}
+
+/* Get an integer from the prompt, asking again until the answer is a valid
+   number between min and max.  An empty answer, or the end of input,
+   selects deft (the same choice Parser_getbool() makes), so a closed
+   input stream cannot make this spin forever. */
+int Parser_getint(const char *prompt, long min, long max, long deft, int base)
+{
+        /* prompt + a decimal and a hex rendering of a long + punctuation */
+        size_t size = strlen(prompt) + 64;
+        char *theprompt = malloc(size);
+        long result = deft;
+        char *line;
+        int rc;
+
+        assert(theprompt);
+        snprintf(theprompt, size, "%s [%ld, (0x%lx)]: ", prompt, deft,
+                 (unsigned long)deft);
+
+        fflush(stdout);
+
+        for (;;) {
+                line = readline(theprompt);
+                if (line == NULL) {
+                        /* end of input: nothing more will ever arrive */
+                        result = deft;
+                        break;
+                }
+                if (*line == '\0') {
+                        free(line);
+                        result = deft;
+                        break;
+                }
+
+                rc = Parser_arg2int(line, &result, base);
+                free(line);
+                if (rc != 0) {
+                        fprintf(stdout, "Invalid string.\n");
+                        fflush(stdout);
+                } else if (result > max || result < min) {
+                        fprintf(stdout,
+                                "Error: response must lie between %ld and %ld.\n",
+                                min, max);
+                        fflush(stdout);
+                } else {
+                        break;
+                }
+        }
+
+        free(theprompt);
+        return result;
+}
+
+/* Ask a yes/no question until the answer starts with one of "yYnN".
+   Returns 1 for yes and 0 for no; an empty answer or the end of input
+   selects deft, which must be 0 or 1. */
+int Parser_getbool(const char *prompt, int deft)
+{
+        int result = 0;
+        char *line;
+        int size = strlen(prompt) + 8;
+        char *theprompt = malloc(size);
+
+        assert(theprompt);
+
+        fflush(stdout);
+
+        if (!(deft == 0 || deft == 1)) {
+                fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n",
+                        deft);
+                assert ( 0 );
+        }
+        sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y");
+
+        for (;;) {
+                line = readline(theprompt);
+                if (line == NULL || *line == '\0') {
+                        result = deft;
+                        break;
+                }
+                if (*line == 'y' || *line == 'Y') {
+                        result = 1;
+                        break;
+                }
+                if (*line == 'n' || *line == 'N') {
+                        result = 0;
+                        break;
+                }
+
+                /* neither yes nor no: complain and ask again */
+                free(line);
+                fprintf(stdout, "Invalid string. Must start with yY or nN\n");
+                fflush(stdout);
+        }
+
+        free(line);             /* NULL after end of input, which is fine */
+        free(theprompt);
+        return result;
+}
+
+/* Parse an int out of inp, or prompt for one (with default deft and range
+   min..max) when inp is missing or is not a valid number in base.  A
+   value parsed from inp is returned as is, without a range check. */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                  int min, int max, int base)
+{
+        long result;
+
+        if (inp != NULL && Parser_arg2int(inp, &result, base) == 0)
+                return result;
+
+        /* Parser_getint() takes (prompt, min, max, deft, base) */
+        return Parser_getint(prompt, min, max, deft, base);
+}
+
+/* Return inp itself when it is a non-empty string; otherwise prompt for a
+   string (default deft) and return the result of Parser_getstr(), which
+   stores the answer in answer (len bytes). */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                    char *answer, int len)
+{
+        if (inp != NULL && *inp != '\0')
+                return inp;
+
+        return Parser_getstr(prompt, deft, answer, len);
+}
+
+/* Change a string into a number.  Returns 0 on success and 1 when the base
+   is invalid, the string is empty, or it holds characters that are not
+   part of the number.  The processing of base and validity follows
+   strtol(3). */
+int Parser_arg2int(const char *inp, long *result, int base)
+{
+        char *endptr;
+
+        if (base != 0 && (base < 2 || base > 36))
+                return 1;
+
+        *result = strtol(inp, &endptr, base);
+
+        /* valid only if something was given and all of it was consumed */
+        return (*inp == '\0' || *endptr != '\0') ? 1 : 0;
+}
+
+/* Convert human readable size string to an int; "1k" -> 1024.  The
+   suffixes k, m and g (either case) scale by 2^10, 2^20 and 2^30.
+   Returns 0 and stores the size in *sizep, or returns -1 (leaving *sizep
+   alone) when str does not start with a number or the scaled value does
+   not fit in an int. */
+int Parser_size (int *sizep, char *str) {
+        /* INT_MAX without <limits.h>: all value bits of an int set */
+        const long long int_max = (long long)(~0U >> 1);
+        long long scaled;
+        int size;
+        int shift = 0;
+        char mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
+        default:
+                return (-1);
+
+        case 1:
+                break;
+
+        case 2:
+                switch (*mod) {
+                case 'g':
+                case 'G':
+                        shift = 30;
+                        break;
+
+                case 'm':
+                case 'M':
+                        shift = 20;
+                        break;
+
+                case 'k':
+                case 'K':
+                        shift = 10;
+                        break;
+
+                default:
+                        break;
+                }
+                break;
+        }
+
+        /* scale in a wider type: shifting the int itself overflows (and is
+           undefined) for large or negative sizes */
+        scaled = (long long)size * (1LL << shift);
+        if (scaled > int_max || scaled < -int_max - 1)
+                return (-1);
+
+        *sizep = (int)scaled;
+        return (0);
+}
+
+/* Convert a string boolean to an int; "enable" -> 1.  Accepts, in any
+   letter case, no/n/off/disable for 0 and yes/y/on/enable for 1.
+   Returns 0 on success and -1 (leaving *b alone) for anything else. */
+int Parser_bool (int *b, char *str) {
+        static const char *const off_words[] = { "no", "n", "off", "disable" };
+        static const char *const on_words[] = { "yes", "y", "on", "enable" };
+        size_t i;
+
+        for (i = 0; i < sizeof(off_words) / sizeof(off_words[0]); i++) {
+                if (strcasecmp (str, off_words[i]) == 0) {
+                        *b = 0;
+                        return (0);
+                }
+        }
+
+        for (i = 0; i < sizeof(on_words) / sizeof(on_words[0]); i++) {
+                if (strcasecmp (str, on_words[i]) == 0) {
+                        *b = 1;
+                        return (0);
+                }
+        }
+
+        return (-1);
+}
+
+/* "quit"/"exit" command: make Parser_commands() leave its loop.  The
+   arguments are not used.  Always returns 0. */
+int Parser_quit(int argc, char **argv)
+{
+        (void)argc;
+        (void)argv;
+
+        done = 1;
+        return 0;
+}
diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h
new file mode 100644 (file)
index 0000000..dead9f5
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY        100             /* Don't let history grow unbounded    */
+#define MAXARGS 100                    /* Size of the argv array execute_line() fills */
+
+/* CMD_COMPLETE .. CMD_AMBIG are the results of the command lookup in
+ * parser.c.  A handler that returns CMD_HELP makes execute_line() print
+ * that command's help text. */
+#define CMD_COMPLETE   0
+#define CMD_INCOMPLETE 1
+#define CMD_NONE       2
+#define CMD_AMBIG      3
+#define CMD_HELP       4
+
+/* One entry of a command table.  A table is an array of these, ended by an
+ * entry whose pc_name is NULL. */
+typedef struct parser_cmd {
+       char    *pc_name;                       /* command word */
+       int     (* pc_func)(int, char **);      /* handler, called with (argc, argv); NULL if the entry only groups sub-commands */
+       struct parser_cmd * pc_sub_cmd;         /* table of sub-commands, or NULL */
+       char *pc_help;                          /* help text */
+} command_t;
+
+/* NOTE(review): not used by the parser code shown alongside this header;
+ * presumably a flat (name, handler, help) command entry -- confirm. */
+typedef struct argcmd {
+       char    *ac_name;
+       int      (*ac_func)(int, char **);
+       char     *ac_help;
+} argcmd_t;
+
+/* NOTE(review): not used by the parser code shown alongside this header;
+ * presumably describes a network endpoint -- confirm. */
+typedef struct network {
+       char    *type;
+       char    *server;
+       int     port;
+} network_t;
+
+int  Parser_quit(int argc, char **argv);
+void Parser_init(char *, command_t *); /* Set prompt and load command list */
+int Parser_commands(void);                     /* Start the command parser */
+void Parser_qhelp(int, char **);       /* Quick help routine */
+int Parser_help(int, char **);         /* Detailed help routine */
+void Parser_printhelp(char *);         /* Detailed help routine */
+void Parser_exit(int, char **);                /* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res, 
+                   size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+                 int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                  int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len);
+
+/* Extracts an integer from a string  with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert human readable size string to an int; "1k" -> 1024 */
+int Parser_size(int *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c
new file mode 100644 (file)
index 0000000..8235271
--- /dev/null
@@ -0,0 +1,1005 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <asm/byteorder.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+#include "parser.h"
+
+/* Externally visible debug settings; nothing in the visible part of this
+ * file reads or writes them (presumably consumed by the portals debug
+ * support -- TODO confirm). */
+unsigned int portal_debug;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+
+/* State shared by the jt_ptl_* commands below. */
+static ptl_nid_t g_nid = 0;             /* NID recorded by the last 'connect' */
+static unsigned int g_nal = 0;          /* NAL selected by 'network'; 0 == none yet */
+static unsigned short g_port = 0;       /* port given to the last tcp/toe 'connect' */
+
+/* Socket options applied by 'connect'; a buffer size of 0 leaves the
+ * system default alone, non-zero g_socket_nonagle sets TCP_NODELAY. */
+static int g_socket_txmem = 0;
+static int g_socket_rxmem = 0;
+static int g_socket_nonagle = 1;
+
+/* One row of a name <-> number translation table. */
+typedef struct
+{
+        char *name;
+        int   num;
+} name2num_t;
+
+/* NAL names accepted on the command line and the NAL number each one
+ * selects.  The row with a NULL name terminates the table. */
+static name2num_t nalnames[] = {
+        { .name = "tcp",    .num = SOCKNAL },
+        { .name = "toe",    .num = TOENAL },
+        { .name = "elan",   .num = QSWNAL },
+        { .name = "gm",     .num = GMNAL },
+        { .name = "scimac", .num = SCIMACNAL },
+        { .name = NULL,     .num = -1 }
+};
+
+/*
+ * Return the row of 'table' whose name equals 'str', or NULL when no row
+ * matches.  'table' must end with a row whose name is NULL.
+ */
+static name2num_t *
+name2num_lookup_name (name2num_t *table, char *str)
+{
+        name2num_t *row;
+
+        for (row = table; row->name != NULL; row++) {
+                if (strcmp (str, row->name) == 0)
+                        return (row);
+        }
+
+        return (NULL);
+}
+
+/*
+ * Return the first row of 'table' whose number equals 'num', or NULL when
+ * no row matches.  'table' must end with a row whose name is NULL.
+ */
+static name2num_t *
+name2num_lookup_num (name2num_t *table, int num)
+{
+        name2num_t *row;
+
+        for (row = table; row->name != NULL; row++) {
+                if (row->num == num)
+                        return (row);
+        }
+
+        return (NULL);
+}
+
+/* Translate a NAL name (see nalnames) into its NAL number; an unknown
+ * name yields 0. */
+int
+ptl_name2nal (char *str)
+{
+        name2num_t *match = name2num_lookup_name (nalnames, str);
+
+        if (match == NULL)
+                return (0);
+
+        return (match->num);
+}
+
+/* Translate a NAL number into its name from nalnames; an unknown number
+ * yields the literal "???". */
+static char *
+nal2name (int nal)
+{
+        name2num_t *match = name2num_lookup_num (nalnames, nal);
+
+        if (match == NULL)
+                return ("???");
+
+        return (match->name);
+}
+
+/*
+ * Guess which NAL a NID belongs to.
+ *
+ * BIG pragmatic assumption: a NID with any of bits 16..31 set is treated
+ * as a socknal NID, anything else as a QSW (elan) NID.
+ */
+static int
+nid2nal (ptl_nid_t nid)
+{
+        __u32 low32 = (__u32)nid;
+
+        if ((low32 & 0xffff0000) == 0)
+                return (QSWNAL);
+
+        return (SOCKNAL);
+}
+
+/*
+ * Parse 'str' into a NID and store it in *nidp.
+ *
+ * The forms are tried in this order:
+ *   1. dotted quad "a.b.c.d" with every component in 0..255;
+ *   2. a hostname (first character alphabetic) resolved with
+ *      gethostbyname(), stored in HOST byte order;
+ *   3. an integer accepted by sscanf "%i" (decimal, 0x hex, 0 octal);
+ *   4. a bare hexadecimal number.
+ *
+ * Returns 0 on success, -1 if 'str' matches none of the forms.
+ */
+int
+ptl_parse_nid (ptl_nid_t *nidp, char *str)
+{
+        struct hostent *he;
+        int             a;
+        int             b;
+        int             c;
+        int             d;
+        unsigned int    hex;
+
+        if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 &&
+            (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+            (c & ~0xff) == 0 && (d & ~0xff) == 0)
+        {
+                /* shift as unsigned: (int)a << 24 overflows for a >= 128 */
+                __u32 addr = ((__u32)a << 24) | ((__u32)b << 16) |
+                             ((__u32)c << 8) | (__u32)d;
+
+                *nidp = (ptl_nid_t)addr;
+                return (0);
+        }
+
+        if ((('a' <= str[0] && str[0] <= 'z') ||
+             ('A' <= str[0] && str[0] <= 'Z')) &&
+             (he = gethostbyname (str)) != NULL &&
+             he->h_addrtype == AF_INET)
+        {
+                __u32 addr;
+
+                /* copy rather than dereference a cast pointer: h_addr is a
+                 * char buffer with no alignment guarantee for __u32 */
+                memcpy (&addr, he->h_addr, sizeof (addr));
+
+                *nidp = (ptl_nid_t)ntohl(addr);  /* HOST byte order */
+                return (0);
+        }
+
+        if (sscanf (str, "%i", &a) == 1)
+        {
+                *nidp = (ptl_nid_t)a;
+                return (0);
+        }
+
+        /* "%x" stores through an unsigned int pointer; the value is then
+         * converted through int exactly as before */
+        if (sscanf (str, "%x", &hex) == 1)
+        {
+                *nidp = (ptl_nid_t)(int)hex;
+                return (0);
+        }
+
+        return (-1);
+}
+
+/*
+ * Format 'nid' as text into 'buffer' and return 'buffer'.
+ *
+ * The NAL is guessed with nid2nal(): QSW NIDs print as decimal, SCIMAC
+ * NIDs as hex, socknal NIDs as the hostname found by gethostbyaddr() or,
+ * failing that, as a dotted quad.
+ *
+ * NOTE(review): no buffer length is passed in and the hostname is copied
+ * with strcpy(), so the caller must supply a buffer big enough for any
+ * hostname the resolver may return.
+ */
+char *
+ptl_nid2str (char *buffer, ptl_nid_t nid)
+{
+        int nal = nid2nal (nid);
+
+        if (nal == QSWNAL) {
+                sprintf (buffer, LPD64, nid);
+        } else if (nal == SCIMACNAL) {
+                sprintf (buffer, LPX64, nid);
+        } else if (nal == SOCKNAL) {
+                __u32           addr = htonl((__u32)nid); /* back to NETWORK byte order */
+                struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET);
+
+                if (he == NULL) {
+                        /* no reverse mapping: print the host-order address */
+                        addr = (__u32)nid;
+                        sprintf (buffer, "%d.%d.%d.%d",
+                                 (addr>>24)&0xff, (addr>>16)&0xff, (addr>>8)&0xff, addr&0xff);
+                } else {
+                        strcpy (buffer, he->h_name);
+                }
+        } else {
+                sprintf (buffer, "nid2nal broken");
+        }
+
+        return (buffer);
+}
+
+/*
+ * Write exactly 'nob' bytes from 'buffer' to 'cfd', retrying after EINTR
+ * and after short writes.
+ *
+ * Returns 0 once everything has been written, or the negative result of
+ * write() (errno set) on any other error.  A write() that returns 0 is
+ * treated as fatal and aborts the program.
+ */
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        char *cursor = buffer;
+
+        while (nob > 0)
+        {
+                int rc = write (cfd, cursor, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                /* advance by what was actually written, not by what is
+                 * still left to write */
+                nob -= rc;
+                cursor += rc;
+        }
+
+        return (0);
+}
+
+/*
+ * Read exactly 'nob' bytes from 'cfd' into 'buffer', retrying after EINTR
+ * and after short reads.
+ *
+ * Returns 0 once everything has been read.  On a read() error returns its
+ * negative result (errno set); on EOF before 'nob' bytes have arrived
+ * returns -1 with errno set to ECONNABORTED.
+ */
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        char *cursor = buffer;
+
+        while (nob > 0)
+        {
+                int rc = read (cfd, cursor, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+
+                        return (rc);
+                }
+
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+
+                /* advance by what was actually read, not by what is still
+                 * left to read */
+                nob -= rc;
+                cursor += rc;
+        }
+
+        return (0);
+}
+
+/* Register the portals ioctl device (PORTALS_DEV_ID at PORTALS_DEV_PATH)
+ * with the ioctl helper.  argc/argv are not used; always returns 0. */
+int ptl_initialize(int argc, char **argv) 
+{
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+        return 0;
+}
+
+
+/*
+ * 'network' command: select the NAL that later commands operate on.
+ *
+ * Expects exactly one argument naming a NAL from nalnames.  On success
+ * the choice is stored in g_nal; otherwise a usage line listing the known
+ * NAL names is printed.  Returns 0 either way.
+ */
+int jt_ptl_network(int argc, char **argv)
+{
+        name2num_t *entry;
+        int         nal = 0;
+
+        if (argc == 2)
+                nal = ptl_name2nal (argv[1]);
+
+        if (nal != 0) {
+                g_nal = nal;
+                return (0);
+        }
+
+        /* wrong argument count or unknown name: list the alternatives */
+        fprintf(stderr, "usage: %s \n", argv[0]);
+        for (entry = nalnames; entry->name != NULL; entry++)
+                fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
+        fprintf(stderr, ">\n");
+
+        return (0);
+}
+
+/*
+ * Exchange portals HELLO headers with the peer on connected socket 'cfd'.
+ *
+ * Sends a HELLO header carrying 'my_nid' and the protocol magic/version,
+ * then reads and validates the peer's HELLO.  All header fields travel in
+ * little-endian byte order.
+ *
+ * Returns 0 and stores the peer's NID in *peer_nid on success; returns -1
+ * after printing a diagnostic on any I/O error, bad magic, incompatible
+ * protocol version or unexpected header contents.
+ *
+ * NOTE(review): reading "the rest" of the header at hmv + 1 assumes
+ * dest_nid is the first member of ptl_hdr_t -- confirm against the
+ * ptl_hdr_t definition.
+ */
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        if (__le32_to_cpu (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n",
+                         __le32_to_cpu (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__le16_to_cpu (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __le16_to_cpu (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __le16_to_cpu (hmv->version_major),
+                         __le16_to_cpu (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                /* the rest of the header is only understood for our own
+                 * version, so do not carry on with an incompatible peer */
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__le32_to_cpu (hdr.type) != PTL_MSG_HELLO ||
+            __le32_to_cpu (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __le32_to_cpu (hdr.type),
+                         __le32_to_cpu (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+/*
+ * 'connect' command.
+ *
+ * tcp/toe:  connect <hostname> <port> [flags]
+ *     Opens a TCP connection to hostname:port, applying the socket options
+ *     selected by g_socket_nonagle / g_socket_rxmem / g_socket_txmem, and
+ *     hands the connected fd to the NAL with NAL_CMD_REGISTER_PEER_FD.
+ *     Each character of 'flags' is one option: 'i' sets ioc_flags (bind
+ *     irq), 'x' exchanges NIDs with the peer via exchange_nids() instead
+ *     of using the peer's IP address as its NID.
+ * elan/gm:  connect <id>       records the decimal id as g_nid.
+ * scimac:   connect <hex id>   records the hex id as g_nid.
+ *
+ * Returns 0 on success or after printing usage, -1 on error.  The socket
+ * is closed on every path once it has been opened.
+ */
+int jt_ptl_connect(int argc, char **argv)
+{
+        if (argc < 2) {
+        usage:
+                fprintf(stderr, "usage: %s <hostname port [xi]> or <elan ID>\n",
+                        argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                ptl_nid_t peer_nid;
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                struct sockaddr_in srvaddr;
+                char *flag;
+                int fd, rc;
+                int nonagle = 0;
+                int rxmem = 0;
+                int txmem = 0;
+                int bind_irq = 0;
+                int xchange_nids = 0;
+                int o;
+                socklen_t olen;         /* getsockopt() takes a socklen_t * */
+
+                if (argc < 3) {
+                        goto usage;
+                }
+
+                he = gethostbyname(argv[1]);
+                if (!he) {
+                        /* gethostbyname() reports through h_errno, not errno */
+                        fprintf(stderr, "gethostbyname error: %s\n",
+                                hstrerror(h_errno));
+                        return -1;
+                }
+
+                g_port = atol(argv[2]);
+
+                if (argc > 3)
+                        for (flag = argv[3]; *flag != 0; flag++)
+                                switch (*flag)
+                                {
+                                case 'i':
+                                        bind_irq = 1;
+                                        break;
+
+                                case 'x':
+                                        xchange_nids = 1;
+                                        break;
+
+                                default:
+                                        fprintf (stderr, "unrecognised flag '%c'\n",
+                                                 *flag);
+                                        return (-1);
+                                }
+
+                memset(&srvaddr, 0, sizeof(srvaddr));
+                srvaddr.sin_family = AF_INET;
+                srvaddr.sin_port = htons(g_port);
+                srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr;
+
+                fd = socket(PF_INET, SOCK_STREAM, 0);
+                if ( fd < 0 ) {
+                        fprintf(stderr, "socket() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                /* From here on every error path must close fd.  Messages
+                 * are printed before close() so errno is still intact. */
+
+                if (g_socket_nonagle)
+                {
+                        o = 1;
+                        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0)
+                        {
+                                fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_rxmem != 0)
+                {
+                        o = g_socket_rxmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0)
+                        {
+                                fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_txmem != 0)
+                {
+                        o = g_socket_txmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0)
+                        {
+                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
+                                close (fd);
+                                return (-1);
+                        }
+                }
+
+                rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                if ( rc == -1 ) {
+                        fprintf(stderr, "connect() failed: %s\n",
+                                strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                /* Report the options actually in effect; failing to read
+                 * them back is not fatal. */
+                olen = sizeof (txmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
+                        fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
+                olen = sizeof (rxmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
+                        fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
+                olen = sizeof (nonagle);
+                if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
+                        fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
+
+                if (xchange_nids) {
+
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = g_nal;
+                        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+                        if (rc != 0)
+                        {
+                                fprintf (stderr, "failed to get my nid: %s\n",
+                                         strerror (errno));
+                                close (fd);
+                                return (-1);
+                        }
+
+                        rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (fd);
+                                return (-1);
+                        }
+                }
+                else
+                        peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */
+
+                printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
+                       peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = fd;
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to register fd with portals: "
+                                "%s\n", strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                g_nid = peer_nid;
+                printf("Connection to "LPX64" registered with socknal\n", g_nid);
+
+                /* our descriptor is closed once the fd has been registered */
+                rc = close(fd);
+                if (rc) {
+                        fprintf(stderr, "close failed: %d\n", rc);
+                }
+        } else if (g_nal == QSWNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == GMNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == SCIMACNAL) {
+                unsigned int    tmpnid;
+                if(sscanf(argv[1], "%x", &tmpnid) == 1) {
+                        g_nid=tmpnid;
+                }
+                else {
+                        fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]);
+                        return -1;
+                }
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * 'disconnect' command: close the connection to the named host, or every
+ * connection when no hostname is given.
+ *
+ * Only meaningful for the tcp/toe NALs, where NAL_CMD_CLOSE_CONNECTION is
+ * issued with the peer NID (the host's IP address in HOST byte order), or
+ * with a zero NID meaning "all".  For the other NALs a notice is printed.
+ *
+ * Returns 0 on success or after printing usage, -1 on error.
+ */
+int jt_ptl_disconnect(int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                /* gethostbyname() reports through h_errno,
+                                 * not errno */
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        hstrerror(h_errno));
+                                return -1;
+                        }
+
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Disconnecting ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to remove connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * 'push' command: flush the connection to the named host, or every
+ * connection when no hostname is given.
+ *
+ * Only meaningful for the tcp/toe NALs, where NAL_CMD_PUSH_CONNECTION is
+ * issued with the peer NID (the host's IP address in HOST byte order), or
+ * with a zero NID meaning "all".  For the other NALs a notice is printed.
+ *
+ * Returns 0 on success or after printing usage, -1 on error.
+ */
+int jt_ptl_push_connection (int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                /* gethostbyname() reports through h_errno,
+                                 * not errno */
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        hstrerror(h_errno));
+                                return -1;
+                        }
+
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Pushing ALL connections.\n");
+                        /* leave ioc_nid zeroed == push all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to push connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'push' doesn't make any sense for elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'push' doesn't make any sense for GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'push' doesn't make any sense for SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * 'ping' command: ping nid [count] [size] [timeout (secs)]
+ *
+ * Starts a ping test against 'nid' through IOC_PORTAL_PING on the NAL
+ * selected by 'network'.  count defaults to 1 and must lie in 0..20000,
+ * size defaults to 4 and timeout to 1.
+ *
+ * Returns 0 on success or after printing usage, -1 on error.
+ */
+int jt_ptl_ping(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t nid;
+        long      count;
+        long      size;
+        long      timeout;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]);
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        /* the default count is always in range, so one check covers both
+         * the default and a user-supplied value */
+        count = (argc > 2) ? atol(argv[2]) : 1;
+        if (count < 0 || count > 20000) {
+                fprintf(stderr, "are you insane?  %ld is a crazy count.\n", count);
+                return -1;
+        }
+
+        size    = (argc > 3) ? atol(argv[3]) : 4;
+        timeout = (argc > 4) ? atol (argv[4]) : 1;
+
+        PORTAL_IOC_INIT (data);
+        data.ioc_count   = count;
+        data.ioc_size    = size;
+        data.ioc_nid     = nid;
+        data.ioc_nal     = g_nal;
+        data.ioc_timeout = timeout;
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data) != 0) {
+                fprintf(stderr, "failed to start pinger: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * 'mynid' command: tell the tcp/toe NAL what the local NID is.
+ *
+ * The NID is the IPv4 address (HOST byte order) of the hostname given as
+ * the optional argument, or of this machine's own hostname when no
+ * argument is given.  It is registered with NAL_CMD_REGISTER_MYNID.
+ *
+ * Returns -1 when no NAL is selected, the NAL is not tcp/toe or the name
+ * cannot be resolved; otherwise 0, including when the ioctl itself fails
+ * (the failure is only reported on stderr).
+ */
+int jt_ptl_mynid(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        struct hostent *host;
+        ptl_nid_t mynid;
+        char local_name[1024];
+        char *hostname;
+
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                fprintf(stderr, "hostname defaults to the hostname of the "
+                        "machine.\n");
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        /* NALs for which a local NID cannot be set from here */
+        if (g_nal == QSWNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for elan.\n");
+                return -1;
+        }
+        if (g_nal == GMNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for GM.\n");
+                return -1;
+        }
+        if (g_nal == SCIMACNAL) {
+                fprintf(stderr, "'mynid' doesn't make any sense for SCI.\n");
+                return -1;
+        }
+
+        if (!(g_nal == SOCKNAL || g_nal == TOENAL)) {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        if (argc != 1) {
+                hostname = argv[1];
+        } else {
+                if (gethostname(local_name, sizeof(local_name)) != 0) {
+                        fprintf(stderr, "gethostname failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+                hostname = local_name;
+        }
+
+        host = gethostbyname(hostname);
+        if (host == NULL) {
+                fprintf(stderr, "cannot get address for host '%s': %d\n",
+                        hostname, h_errno);
+                return -1;
+        }
+        mynid = (ptl_nid_t)ntohl (*(__u32 *)host->h_addr);   /* HOST byte order */
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = mynid;
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID;
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data) < 0)
+                fprintf(stderr, "IOC_PORTAL_REGISTER_MYNID failed: %s\n",
+                       strerror(errno));
+        else
+                printf("registered my nid "LPX64" (%s)\n", mynid, hostname);
+
+        return 0;
+}
+
+/*
+ * 'fail_nid' command: fail_nid nid|"_all_" [count (0 == mend)]
+ *
+ * Issues IOC_PORTAL_FAIL_NID for the given NID (or PTL_NID_ANY for
+ * "_all_") on the NAL selected by 'network'.  'count' is passed on as
+ * ioc_count and defaults to PTL_MD_THRESH_INF.
+ *
+ * Returns -1 when no NAL is selected or an argument cannot be parsed;
+ * otherwise 0, including when the ioctl itself fails.
+ */
+int
+jt_ptl_fail_nid (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid;
+        unsigned int             threshold = PTL_MD_THRESH_INF;
+
+        if (argc < 2 || argc > 3) {
+                fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]);
+                return (0);
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (strcmp (argv[1], "_all_") == 0) {
+                nid = PTL_NID_ANY;
+        } else if (ptl_parse_nid (&nid, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        /* an explicit count replaces the "infinite" default */
+        if (argc >= 3 && sscanf (argv[2], "%i", &threshold) != 1) {
+                fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nid = nid;
+        data.ioc_count = threshold;
+
+        if (l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data) < 0)
+                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
+                         strerror (errno));
+        else
+                printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]);
+
+        return (0);
+}
+
+/*
+ * Set (when an argument is given) and then print the receive buffer size
+ * that 'connect' applies to new sockets.  An argument that cannot be
+ * parsed, or is negative, leaves the setting alone.  Always returns 0.
+ */
+int
+jt_ptl_rxmem (int argc, char **argv)
+{
+        if (argc > 1) {
+                int parsed;
+
+                if (Parser_size (&parsed, argv[1]) != 0 || parsed < 0) {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+
+                g_socket_rxmem = parsed;
+        }
+
+        printf ("Socket rmem = %d\n", g_socket_rxmem);
+        return (0);
+}
+
+/*
+ * Set (when an argument is given) and then print the send buffer size
+ * that 'connect' applies to new sockets.  An argument that cannot be
+ * parsed, or is negative, leaves the setting alone.  Always returns 0.
+ */
+int
+jt_ptl_txmem (int argc, char **argv)
+{
+        if (argc > 1) {
+                int parsed;
+
+                if (Parser_size (&parsed, argv[1]) != 0 || parsed < 0) {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+
+                g_socket_txmem = parsed;
+        }
+
+        printf ("Socket txmem = %d\n", g_socket_txmem);
+        return (0);
+}
+
+/*
+ * Set (when an argument is given) and then print whether Nagle is used
+ * on sockets opened by 'connect'.  The argument is a boolean parsed by
+ * Parser_bool(); enabling Nagle clears g_socket_nonagle.  Always
+ * returns 0.
+ */
+int
+jt_ptl_nagle (int argc, char **argv)
+{
+        if (argc > 1) {
+                int want_nagle;
+
+                if (Parser_bool (&want_nagle, argv[1]) != 0) {
+                        fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
+                        return (0);
+                }
+
+                g_socket_nonagle = !want_nagle;
+        }
+
+        printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled");
+        return (0);
+}
+
+/*
+ * 'add_route' command: add_route gateway target [target]
+ *
+ * Adds a routing table entry that sends traffic for the inclusive NID
+ * range [min(target1, target2), max(target1, target2)] via 'gateway'.
+ * With a single target the range holds just that NID.  The gateway's NAL
+ * is guessed from its NID with nid2nal().
+ *
+ * Returns 0 on success or after printing usage, -1 on a parse or ioctl
+ * error.
+ */
+int
+jt_ptl_add_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        ptl_nid_t                gateway_nid;
+        int                      gateway_nal;
+        int                      rc;
+
+        if (argc < 3)
+        {
+                fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        gateway_nal = nid2nal (gateway_nid);
+
+        if (ptl_parse_nid (&nid1, argv[2]) != 0)
+        {
+                fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]);
+                return (-1);
+        }
+
+        if (argc < 4)
+                nid2 = nid1;
+        else if (ptl_parse_nid (&nid2, argv[3]) != 0)
+        {
+                /* report the argument that was actually parsed; argv[4]
+                 * is NULL when exactly two targets were given */
+                fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = gateway_nid;
+        data.ioc_nal = gateway_nal;
+        data.ioc_nid2 = MIN (nid1, nid2);
+        data.ioc_nid3 = MAX (nid1, nid2);
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data);
+        if (rc != 0)
+        {
+                fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno));
+                return (-1);
+        }
+
+        return (0);
+}
+
+/*
+ * 'del_route' command: del_route targetNID
+ *
+ * Asks the router to drop the routing table entry for 'targetNID' via
+ * IOC_PORTAL_DEL_ROUTE.  Returns 0 on success or after printing usage,
+ * -1 on a parse or ioctl error.
+ */
+int
+jt_ptl_del_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid;
+
+        if (argc < 2) {
+                fprintf (stderr, "usage: %s targetNID\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0) {
+                fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = nid;
+
+        if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data) != 0) {
+                fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno));
+                return (-1);
+        }
+
+        return (0);
+}
+
+/*
+ * 'print_routes' command: print the routing table.
+ *
+ * Fetches entries one index at a time with IOC_PORTAL_GET_ROUTE until the
+ * ioctl fails, printing the gateway NAL, the gateway NID and the target
+ * NID range of each entry.  Always returns 0.
+ */
+int
+jt_ptl_print_routes (int argc, char **argv)
+{
+        char                      buffer[3][128];
+        struct portal_ioctl_data  data;
+        int                       index = 0;
+
+        while (1) {
+                int       gateway_nal;
+                ptl_nid_t gateway_nid;
+                ptl_nid_t lo_nid;
+                ptl_nid_t hi_nid;
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_count = index;
+
+                /* the first failing index marks the end of the table */
+                if (l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data) != 0)
+                        break;
+
+                gateway_nal = data.ioc_nal;
+                gateway_nid = data.ioc_nid;
+                lo_nid = data.ioc_nid2;
+                hi_nid = data.ioc_nid3;
+
+                /* one scratch buffer per NID: all three strings must stay
+                 * valid until printf has used them */
+                printf ("%8s %18s : %s - %s\n",
+                        nal2name (gateway_nal),
+                        ptl_nid2str (buffer[0], gateway_nid),
+                        ptl_nid2str (buffer[1], lo_nid),
+                        ptl_nid2str (buffer[2], hi_nid));
+
+                index++;
+        }
+
+        return (0);
+}
+
diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c
new file mode 100644 (file)
index 0000000..d38bd4a
--- /dev/null
@@ -0,0 +1,64 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+#include "parser.h"
+
+
+/*
+ * Command table handed to the parser by main().
+ *
+ * Each entry is { command name, handler, 0, help text }; the third field is
+ * 0 for every command here (its meaning is defined by command_t in parser.h
+ * -- not visible from this file).  The all-zero entry terminates the table.
+ */
+command_t list[] = {
+        {"network", jt_ptl_network, 0, "setup the NAL (args: nal name)"},
+        {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: <hostname port> | <id> for tcp/elan respectively)"},
+        {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname])"},
+        {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname])"},
+        {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
+        {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
+        {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+        {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID)"},
+        {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
+        {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
+        {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
+        {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
+/*
+ * ptlctl entry point.
+ *
+ * Exits with status 1 if ptl_initialize() reports failure (negative return).
+ * Otherwise registers the prompt "ptlctl > " and the command table with the
+ * parser and then runs in one of two modes:
+ *   - with command-line arguments: they are passed (minus the program name)
+ *     to Parser_execarg() and its return value becomes the exit status;
+ *   - without arguments: Parser_commands() is called (presumably the
+ *     interactive command loop -- see parser.h) and 0 is returned afterwards.
+ */
+int main(int argc, char **argv)
+{
+        if (ptl_initialize(argc, argv) < 0)
+                exit(1);
+
+        Parser_init("ptlctl > ", list);
+
+        if (argc <= 1) {
+                Parser_commands();
+                return 0;
+        }
+
+        /* skip argv[0]: the parser only sees the command and its arguments */
+        return Parser_execarg(argc - 1, argv + 1, list);
+}
diff --git a/lustre/portals/utils/routerstat.c b/lustre/portals/utils/routerstat.c
new file mode 100644 (file)
index 0000000..37da12c
--- /dev/null
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+/*
+ * Return the time reported by gettimeofday() as a double: whole seconds
+ * plus the microsecond part expressed as a fraction of a second.
+ * The return value of gettimeofday() itself is not checked.
+ */
+double
+timenow ()
+{
+   struct timeval stamp;
+   double         fraction;
+
+   gettimeofday (&stamp, NULL);
+
+   fraction = stamp.tv_usec / 1000000.0;
+   return (stamp.tv_sec + fraction);
+}
+
+/*
+ * Read one sample from the statistics file open on fd and print it.
+ *
+ * fd must be open for both reading and writing.  The file is rewound and
+ * read, and its contents are parsed as three or four whitespace-separated
+ * decimal numbers: bytes, packets, errors and an optional fourth value
+ * that, when present, is printed in parentheses at the end of the line.
+ *
+ * The first call prints the raw values only.  Later calls also print
+ * per-second rates, computed over the time between the end of the previous
+ * call and the start of this one.
+ *
+ * After printing, a single newline is written back at offset 0 (presumably
+ * this resets the kernel-side counters -- confirm against the router's
+ * /proc write handler).  A failed write is reported on stderr but is not
+ * fatal.
+ *
+ * Exits the process with status 1 if the file cannot be read or parsed.
+ */
+void
+do_stat (int fd)
+{
+   static char   buffer[1024];
+   static double last = 0.0;   /* 0.0 means "no previous sample yet" */
+   double        now;
+   double        t;
+   double        mbytes_per_sec = 0.0;
+   long long     bytes;
+   long long     avg_size;
+   long          packets;
+   long          errors;
+   long          depth;
+   long          packets_per_sec = 0;
+   long          errors_per_sec = 0;
+   int           n;
+
+   lseek (fd, 0, SEEK_SET);
+   now = timenow();
+   /* Read at most sizeof (buffer) - 1 bytes so the terminating NUL below
+    * always lands inside the array, even when the read fills it. */
+   n = read (fd, buffer, sizeof (buffer) - 1);
+   if (n < 0)
+   {
+      fprintf (stderr, "Can't read statfile\n");
+      exit (1);
+   }
+   buffer[n] = 0;
+
+   /* "%lld" is the ISO C conversion for long long; "%Ld" only works where
+    * the C library accepts it as an extension. */
+   n = sscanf (buffer, "%lld %ld %ld %ld", &bytes, &packets, &errors, &depth);
+
+   /* the fourth field is optional; sscanf's EOF return (-1) also lands here */
+   if (n < 3)
+   {
+      fprintf (stderr, "Can't parse statfile\n");
+      exit (1);
+   }
+
+   avg_size = (packets == 0) ? 0LL : bytes/packets;
+
+   if (last == 0.0)
+      printf ("%lld bytes, %ld packets (sz %lld) %ld errors",
+             bytes, packets, avg_size, errors);
+   else
+   {
+      t = now - last;
+
+      /* Only divide by a positive interval.  A clock that stood still or
+       * stepped backwards would give inf/NaN, and converting those to long
+       * is undefined; the rates are reported as 0 in that case. */
+      if (t > 0.0)
+      {
+         mbytes_per_sec  = ((double)bytes)/((1<<20) * t);
+         packets_per_sec = (long)(packets/t);
+         errors_per_sec  = (long)(errors/t);
+      }
+
+      printf ("%9lld (%7.2fMb/s), %7ld packets (sz %5lld, %5ld/s) %ld errors (%ld/s)",
+             bytes, mbytes_per_sec,
+             packets, avg_size, packets_per_sec,
+             errors, errors_per_sec);
+   }
+
+   if (n == 4)
+      printf (" (%ld)\n", depth);
+   else
+      printf ("\n");
+
+   fflush (stdout);
+
+   lseek (fd, 0, SEEK_SET);
+   /* Keep sampling if the write-back fails, but say so instead of
+    * silently ignoring it. */
+   if (write (fd, "\n", 1) < 0)
+      fprintf (stderr, "Can't write statfile: %s\n", strerror (errno));
+   last = timenow();
+}
+
+/*
+ * routerstat [interval]
+ *
+ * Opens /proc/sys/portals/router read/write and prints one sample with
+ * do_stat().  With no interval argument, or an interval of 0, it returns
+ * after that single sample; otherwise it prints a further sample every
+ * `interval' seconds and never returns.
+ *
+ * Returns 0 after a single sample, or 1 if the interval argument is not a
+ * non-negative whole number of seconds or the stat file cannot be opened.
+ */
+int main (int argc, char **argv)
+{
+   unsigned int  interval = 0;
+   int           fd;
+
+   if (argc > 1)
+   {
+      char *end;
+      long  val;
+
+      /* strtol instead of atoi: a negative value would turn into a huge
+       * unsigned sleep() time, and non-numeric input would silently be
+       * treated as "sample once". */
+      errno = 0;
+      val = strtol (argv[1], &end, 10);
+      if (errno != 0 || end == argv[1] || *end != '\0' ||
+          val < 0 || (unsigned long) val != (unsigned int) val)
+      {
+         fprintf (stderr, "usage: %s [interval_seconds]\n", argv[0]);
+         return (1);
+      }
+      interval = (unsigned int) val;
+   }
+
+   fd = open ("/proc/sys/portals/router", O_RDWR);
+   if (fd < 0)
+   {
+      fprintf (stderr, "Can't open stat: %s\n", strerror (errno));
+      return (1);
+   }
+
+   do_stat (fd);
+   if (interval == 0)
+      return (0);
+
+   for (;;)
+   {
+      sleep (interval);
+      do_stat (fd);
+   }
+}